## Web scraping for all cities except Almaty

In [None]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import time
import os

def select_city(driver, city_name, current_city="Алматы"):
    """Open the kino.kz movie page and switch its city picker to ``city_name``.

    The picker is opened by clicking the button labelled with the city the
    site currently shows (``current_city``); the button labelled
    ``city_name`` is then clicked. A fixed 15-second pause follows the page
    load and each click.

    Args:
        driver: Selenium WebDriver used for the whole scraping session.
        city_name: City label to select, as it is written on the site.
        current_city: Label the site is expected to show before switching.
            Defaults to the value that used to be hard-coded here.

    Returns:
        True when the requested city was selected or was already shown,
        False when the selection failed. Failures are printed, not raised,
        so existing callers keep their best-effort behaviour.
    """
    # Local import: the file's import block does not provide this name.
    from selenium.common.exceptions import TimeoutException

    def city_button_xpath(label):
        return f"//button[.//span[text()='{label}']]"

    driver.get("https://kino.kz/ru/movie")
    print('timer 15 sec started -------------------')
    time.sleep(15)
    try:
        try:
            city_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, city_button_xpath(current_city)))
            )
        except TimeoutException:
            # The expected opener is missing. If a visible button already
            # carries the requested label there is nothing to switch.
            # NOTE(review): presumably such a button is the page's own
            # current-city button - verify against the live page.
            already_shown = any(
                button.is_displayed()
                for button in driver.find_elements(By.XPATH, city_button_xpath(city_name))
            )
            if already_shown:
                print(f'city already selected: {city_name}')
                return True
            raise
        city_button.click()
        print('timer 15 sec started -------------------')
        time.sleep(15)

        city_select = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, city_button_xpath(city_name)))
        )
        city_select.click()
        print('timer 15 sec started -------------------')
        time.sleep(15)
        return True
    except Exception as e:
        print(f"Ошибка при выборе города: {e}")
        return False

def get_cinema_ids(driver):
    """Collect ``(cinema_id, cinema_name)`` pairs from the page the driver shows.

    Waits up to 10 seconds for the ``a.lmejjw3`` links to be present, then
    takes the last path segment of each link's ``href`` as the id and the
    link's stripped text as the name.

    Args:
        driver: Selenium WebDriver already positioned on the cinema listing.

    Returns:
        List of ``(cinema_id, cinema_name)`` tuples. On error the message is
        printed and whatever was collected so far is returned (possibly an
        empty list).
    """
    cinema_ids = []
    try:
        cinema_elements = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, "a.lmejjw3"))
        )
        for cinema in cinema_elements:
            cinema_link = cinema.get_attribute("href")
            if not cinema_link:
                # Skip a link without an href instead of letting the
                # AttributeError below abort the rest of the listing.
                continue
            # The cinema id is the last URL segment; tolerate a trailing slash.
            cinema_id = cinema_link.rstrip("/").split("/")[-1]
            cinema_name = cinema.text.strip()
            cinema_ids.append((cinema_id, cinema_name))
    except Exception as e:
        print(f"Ошибка при получении списка кинотеатров: {e}")

    return cinema_ids

def _parse_session(session):
    """Read time, hall, language and four ticket prices from one session element."""
    time_text = session.find_element(By.CSS_SELECTOR, "span.nn6vvh6").text
    # Hall and language are optional on the page; fall back to a placeholder.
    hall_element = session.find_elements(By.CSS_SELECTOR, "span.rt-Text.rt-r-size-3.rt-r-weight-bold.nn6vvhb")
    hall = hall_element[0].text if hall_element else "Не указано"
    language_element = session.find_elements(By.CSS_SELECTOR, "span.rt-reset.rt-Badge.rt-r-size-1.rt-variant-outline")
    language = language_element[0].text if language_element else "Не указано"
    prices_elements = session.find_elements(By.CSS_SELECTOR, "div.rt-Box div.rt-Text.rt-r-size-2.rt-r-weight-medium.nn6vvh1")
    # A dash means "no price"; store it as zero.
    prices = [price.text if price.text != "–" else "0 ₸" for price in prices_elements]
    # Pad to the four price columns written below.
    prices += ["0 ₸"] * (4 - len(prices))
    return {
        "time": time_text,
        "hall": hall,
        "language": language,
        "price_adult": prices[0],
        "price_child": prices[1],
        "price_student": prices[2],
        "price_vip": prices[3],
    }


def get_df(city_name):
    """Scrape the kino.kz schedule for one city and save it to Excel.

    Starts a headless Chrome session, switches the site to ``city_name`` via
    ``select_city``, visits every cinema returned by ``get_cinema_ids`` and
    collects one row per session. The rows are written to
    ``<YYYY-MM-DD>/kino_schedule_<city_name>.xlsx`` and appended to
    ``<YYYY-MM-DD>/kino_schedule_combined.xlsx`` (created if missing).

    Args:
        city_name: City label passed to ``select_city`` and stored in the
            ``city`` column of every row.

    Returns:
        pandas.DataFrame holding the rows scraped for this city; it is empty
        when nothing was collected.
    """
    currend_date = time.strftime("%Y-%m-%d")
    combined_path = os.path.join(currend_date, "kino_schedule_combined.xlsx")
    try:
        combined_df = pd.read_excel(combined_path)
    except FileNotFoundError:
        # First city of the day: start from an empty frame.
        combined_df = pd.DataFrame()

    print("File loaded or created successfully!")

    # Driver setup
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    schedule = []
    try:
        select_city(driver, city_name)
        print('1 - city name', city_name)
        # Fetch the list of cinema ids
        cinema_list = get_cinema_ids(driver)
        base_url = "https://kino.kz/cinema/{}"
        for cinema_id, cinema_name in cinema_list:
            try:
                # Page load is inside the try so a failing cinema page is
                # reported and skipped instead of aborting the whole city.
                driver.get(base_url.format(cinema_id))
                print('2 - cinema name', cinema_name)
                movies = WebDriverWait(driver, 10).until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, "a.idu43p0"))
                )

                for movie in movies:
                    try:
                        movie_name = movie.find_element(By.CSS_SELECTOR, "span.rt-Text.rt-r-size-3.rt-r-weight-bold.idu43p1").text
                        sessions = movie.find_elements(By.XPATH, "./following-sibling::div[@class='rt-Flex rt-r-fd-column sm:rt-r-fd-row rt-r-ai-stretch sm:rt-r-ai-center rt-r-jc-space-between rt-r-gap-2 nn6vvh4']")
                        if not sessions:
                            # No session <div> siblings: fall back to <a> siblings.
                            sessions = movie.find_elements(By.XPATH, "./following-sibling::a")

                        print('3 - movie name', movie_name)
                        print('4 - sessions length', len(sessions))
                        for session in sessions:
                            try:
                                details = _parse_session(session)
                                schedule.append({
                                    "date": time.strftime("%Y-%m-%d"),
                                    "city": city_name,
                                    "cinema_id": cinema_id,
                                    "cinema_name": cinema_name,
                                    "movie": movie_name,
                                    **details,
                                    # NOTE(review): constant value; confirm what
                                    # this column is supposed to encode.
                                    "weekday": 1,
                                })
                                time.sleep(1)
                            except Exception as e:
                                print(f"Ошибка при обработке сеанса в кинотеатре {cinema_id}: {e}")
                        print('timer 1 sec started -------------------')
                        time.sleep(1)
                    except Exception as e:
                        print(f"Ошибка при обработке фильма в кинотеатре {cinema_id}: {e}")
            except Exception as e:
                print(f"Ошибка при загрузке страницы кинотеатра {cinema_id}: {e}")
        print('timer 15 sec started -------------------')
        time.sleep(15)
    finally:
        # Always shut the browser down, even when scraping raised.
        driver.quit()
    print("schedule", schedule)

    df = pd.DataFrame(schedule)
    combined_df = pd.concat([combined_df, df], ignore_index=True)
    print(combined_df)

    os.makedirs(currend_date, exist_ok=True)
    test_n = "kino_schedule_" + city_name + ".xlsx"
    file_path = os.path.join(currend_date, test_n)

    # Save this city's rows, then the updated combined workbook
    df.to_excel(file_path, index=False)
    combined_df.to_excel(combined_path, index=False)

    return df



# Reference list of city labels, kept as a bare string literal: it is
# evaluated and discarded, so it has no effect at runtime.
# NOTE(review): "Жетысай" is listed here but has no get_df call below -
# confirm whether that omission is intentional.
'''



"", "Тараз", "Талгар", "Темиртау", "Усть-Каменогорск"
"Астана","Актау", "Актобе", "Аксай", "Атырау", "Алаколь",
"Балхаш", "Бейнеу",
    "Жанаозен", "Жезказган", "Жетысай", "Житикара", 
    "Караганда", "Кокшетау", "Конаев", "Костанай","Кызылорда", 
    "Павлодар", "Петропавловск", "Риддер", 
    "Сатпаев", "Семей", "Степногорск",
    "Талдыкорган", "Талгар", "Тараз", "Темиртау", "Туркестан", 
    "Уральск", "Усть-Каменогорск",
    "Шиели", "Шымкент", "Щучинск", 
    "Форт-Шевченко", "Экибастуз", "Рудный"
'''

# Scrape the cities one after another. Each get_df call starts its own
# browser session, writes that city's workbook and appends its rows to the
# day's combined workbook, so the call order here is the row order of the
# combined file. The returned frames are kept in one variable per city.
dfs_petropavlov = get_df("Петропавловск")
dfs_astan = get_df("Астана")
dfs_aktau = get_df("Актау")
dfs_akto = get_df("Актобе")
dfs_aksay = get_df("Аксай")
dfs_atyr = get_df("Атырау")
dfs_alakol = get_df("Алаколь")
dfs_balh = get_df("Балхаш")
dfs_beyneu = get_df("Бейнеу")    
dfs_jan = get_df("Жанаозен")    
dfs_jez = get_df("Жезказган")    
dfs_jit = get_df("Житикара")    
dfs_kar = get_df("Караганда")    
dfs_kok = get_df("Кокшетау")    
dfs_konaev = get_df("Конаев")    
dfs_kost = get_df("Костанай")    
dfs_kyz = get_df("Кызылорда")    
dfs_pavl = get_df("Павлодар")      
dfs_rid = get_df("Риддер")    
dfs_sat = get_df("Сатпаев")    
dfs_sem = get_df("Семей")    
dfs_step = get_df("Степногорск")    
dfs_tal = get_df("Талдыкорган")    
dfs_talgar = get_df("Талгар")    
dfs_taraz = get_df("Тараз")    
dfs_temp = get_df("Темиртау")    
dfs_tur = get_df("Туркестан")    
dfs_ur = get_df("Уральск")    
dfs_ust = get_df("Усть-Каменогорск")    
dfs_shie = get_df("Шиели")    
dfs_shym = get_df("Шымкент")
dfs_shuchinsk = get_df("Щучинск")
dfs_fort = get_df("Форт-Шевченко")
dfs_ekibastuz = get_df("Экибастуз")
dfs_rud = get_df("Рудный")




File loaded or created successfully!
timer 15 sec started -------------------
timer 15 sec started -------------------
timer 15 sec started -------------------
1 - city name Шымкент
2 - cinema name Cinemax Shymkent Multiplex
пл. Аль-Фараби, 3/1, ТРЦ «Shymkent Plaza»
3 - movie name Ночь в зоопарке
4 - sessions length 1
timer 1 sec started -------------------
3 - movie name Школа магических зверей. Загадка волшебного леса
4 - sessions length 2
timer 1 sec started -------------------
3 - movie name Каяра. Путь судьбы
4 - sessions length 3
timer 1 sec started -------------------
3 - movie name Приключения Паддингтона 3
4 - sessions length 3
timer 1 sec started -------------------
3 - movie name Догмен: Пушистая справедливость
4 - sessions length 5
timer 1 sec started -------------------
3 - movie name Роковая экспедиция
4 - sessions length 2
timer 1 sec started -------------------
3 - movie name Janym Qazaqstan
4 - sessions length 1
timer 1 sec started -------------------
3 - movie name Мә

## Web scraping for Almaty

In [None]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import os
import time

def select_city(driver, city_name, current_city="Астана"):
    """Open the kino.kz movie page and switch its city picker to ``city_name``.

    The picker is opened by clicking the button labelled with the city the
    site currently shows (``current_city``); the button labelled
    ``city_name`` is then clicked. A fixed 5-second pause follows the page
    load and each click.

    Args:
        driver: Selenium WebDriver used for the whole scraping session.
        city_name: City label to select, as it is written on the site.
        current_city: Label the site is expected to show before switching.
            Defaults to the value that used to be hard-coded here.

    Returns:
        True when the requested city was selected or was already shown,
        False when the selection failed. Failures are printed, not raised,
        so existing callers keep their best-effort behaviour.
    """
    # Local import: the file's import block does not provide this name.
    from selenium.common.exceptions import TimeoutException

    def city_button_xpath(label):
        return f"//button[.//span[text()='{label}']]"

    driver.get("https://kino.kz/ru/movie")
    # The message now states the pause that is actually applied.
    print('timer 5 sec started -------------------')
    time.sleep(5)
    try:
        try:
            city_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, city_button_xpath(current_city)))
            )
        except TimeoutException:
            # The expected opener is missing. If a visible button already
            # carries the requested label there is nothing to switch.
            # NOTE(review): presumably such a button is the page's own
            # current-city button - verify against the live page.
            already_shown = any(
                button.is_displayed()
                for button in driver.find_elements(By.XPATH, city_button_xpath(city_name))
            )
            if already_shown:
                print(f'city already selected: {city_name}')
                return True
            raise
        city_button.click()
        print('timer 5 sec started -------------------')
        time.sleep(5)

        city_select = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, city_button_xpath(city_name)))
        )
        city_select.click()
        print('timer 5 sec started -------------------')
        time.sleep(5)
        return True
    except Exception as e:
        print(f"Ошибка при выборе города: {e}")
        return False

def get_cinema_ids(driver):
    """Collect ``(cinema_id, cinema_name)`` pairs from the page the driver shows.

    Waits up to 10 seconds for the ``a.lmejjw3`` links to be present, then
    takes the last path segment of each link's ``href`` as the id and the
    link's stripped text as the name.

    Args:
        driver: Selenium WebDriver already positioned on the cinema listing.

    Returns:
        List of ``(cinema_id, cinema_name)`` tuples. On error the message is
        printed and whatever was collected so far is returned (possibly an
        empty list).
    """
    cinema_ids = []
    try:
        cinema_elements = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, "a.lmejjw3"))
        )
        for cinema in cinema_elements:
            cinema_link = cinema.get_attribute("href")
            if not cinema_link:
                # Skip a link without an href instead of letting the
                # AttributeError below abort the rest of the listing.
                continue
            # The cinema id is the last URL segment; tolerate a trailing slash.
            cinema_id = cinema_link.rstrip("/").split("/")[-1]
            cinema_name = cinema.text.strip()
            cinema_ids.append((cinema_id, cinema_name))
    except Exception as e:
        print(f"Ошибка при получении списка кинотеатров: {e}")

    return cinema_ids
def _parse_session(session):
    """Read time, hall, language and four ticket prices from one session element."""
    time_text = session.find_element(By.CSS_SELECTOR, "span.nn6vvh6").text
    # Hall and language are optional on the page; fall back to a placeholder
    # instead of dropping the whole session when the hall is missing.
    hall_element = session.find_elements(By.CSS_SELECTOR, "span.rt-Text.rt-r-size-3.rt-r-weight-bold.nn6vvhb")
    hall = hall_element[0].text if hall_element else "Не указано"
    language_element = session.find_elements(By.CSS_SELECTOR, "span.rt-reset.rt-Badge.rt-r-size-1.rt-variant-outline")
    language = language_element[0].text if language_element else "Не указано"
    prices_elements = session.find_elements(By.CSS_SELECTOR, "div.rt-Box div.rt-Text.rt-r-size-2.rt-r-weight-medium.nn6vvh1")
    # A dash means "no price"; store it as zero.
    prices = [price.text if price.text != "–" else "0 ₸" for price in prices_elements]
    # Pad to the four price columns written below.
    prices += ["0 ₸"] * (4 - len(prices))
    return {
        "time": time_text,
        "hall": hall,
        "language": language,
        "price_adult": prices[0],
        "price_child": prices[1],
        "price_student": prices[2],
        "price_vip": prices[3],
    }


def get_df(city_name):
    """Scrape the kino.kz schedule for one city and return it as a DataFrame.

    Starts a headless Chrome session, switches the site to ``city_name`` via
    ``select_city``, visits every cinema returned by ``get_cinema_ids`` and
    collects one row per session. Nothing is written to disk here.

    Args:
        city_name: City label passed to ``select_city`` and stored in the
            ``city`` column of every row.

    Returns:
        pandas.DataFrame with one row per scraped session; it is empty when
        nothing was collected.
    """
    # Driver setup
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    schedule = []
    try:
        select_city(driver, city_name)
        print('1', city_name)
        # Fetch the list of cinema ids
        cinema_list = get_cinema_ids(driver)
        base_url = "https://kino.kz/cinema/{}"
        for cinema_id, cinema_name in cinema_list:
            try:
                # Page load is inside the try so a failing cinema page is
                # reported and skipped instead of aborting the whole city.
                driver.get(base_url.format(cinema_id))
                print('2', cinema_name)
                movies = WebDriverWait(driver, 10).until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, "a.idu43p0"))
                )

                for movie in movies:
                    try:
                        movie_name = movie.find_element(By.CSS_SELECTOR, "span.rt-Text.rt-r-size-3.rt-r-weight-bold.idu43p1").text
                        sessions = movie.find_elements(By.XPATH, "./following-sibling::a")
                        print('3', movie_name)
                        for session in sessions:
                            try:
                                details = _parse_session(session)
                                schedule.append({
                                    "date": time.strftime("%Y-%m-%d"),
                                    "city": city_name,
                                    "cinema_id": cinema_id,
                                    "cinema_name": cinema_name,
                                    "movie": movie_name,
                                    **details,
                                    # NOTE(review): constant value; confirm what
                                    # this column is supposed to encode.
                                    "weekday": 1,
                                })
                                time.sleep(1)
                            except Exception as e:
                                print(f"Ошибка при обработке сеанса в кинотеатре {cinema_id}: {e}")
                        print('timer 1 sec started -------------------')
                        time.sleep(1)
                    except Exception as e:
                        print(f"Ошибка при обработке фильма в кинотеатре {cinema_id}: {e}")
            except Exception as e:
                print(f"Ошибка при загрузке страницы кинотеатра {cinema_id}: {e}")
        print('timer 15 sec started -------------------')
        time.sleep(15)
    finally:
        # Always shut the browser down, even when scraping raised.
        driver.quit()

    # Build the result frame; saving is left to the caller
    df = pd.DataFrame(schedule)
    return df

# Today's date doubles as the name of the output folder.
currend_date = time.strftime("%Y-%m-%d")

# Scrape Almaty and wrap the result in a freshly indexed frame.
city_name = "Алматы"
dfs_alm = get_df(city_name)
dfs_concatenated = pd.concat([dfs_alm], ignore_index=True)

# Make sure the dated output folder is there before writing into it.
if not os.path.exists(currend_date):
    os.makedirs(currend_date)

# Write the Almaty schedule to <date>/kino_schedule_combined_<city>.xlsx.
test_n = f"kino_schedule_combined_{city_name}.xlsx"
file_path = os.path.join(currend_date, test_n)
dfs_concatenated.to_excel(file_path, index=False)





timer 15 sec started -------------------
Ошибка при выборе города: Message: 
Stacktrace:
	GetHandleVerifier [0x005E74A3+25091]
	(No symbol) [0x0056DC04]
	(No symbol) [0x0044B373]
	(No symbol) [0x0048F4DC]
	(No symbol) [0x0048F65B]
	(No symbol) [0x004CD8E2]
	(No symbol) [0x004B1F54]
	(No symbol) [0x004CB49E]
	(No symbol) [0x004B1CA6]
	(No symbol) [0x004831D5]
	(No symbol) [0x0048435D]
	GetHandleVerifier [0x008E07C3+3142947]
	GetHandleVerifier [0x008F1A2B+3213195]
	GetHandleVerifier [0x008EC412+3191154]
	GetHandleVerifier [0x00688720+685184]
	(No symbol) [0x00576E1D]
	(No symbol) [0x00573E18]
	(No symbol) [0x00573FB6]
	(No symbol) [0x005666F0]
	BaseThreadInitThunk [0x758D7BA9+25]
	RtlInitializeExceptionChain [0x7760C0CB+107]
	RtlClearBits [0x7760C04F+191]

1 Алматы
2 Halyk IMAX Kinopark 16
Кульджинский тракт, 106, молл Aport East, 2 этаж
3 Муфаса: Король Лев
timer 1 sec started -------------------
3 Школа магических зверей. Загадка волшебного леса
timer 1 sec started -------------------
