## SCRAPING ALL LINKS OF MOVIES IN UGC CINEMA

In [3]:
import re
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from datetime import datetime

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

def get_film_links(url):
    links = []
    driver.get(url)
    
    # Try to close the cookie pop-up
    try:
        cookie_button = driver.find_element(By.CSS_SELECTOR, ".hagreed__continue.hagreed-validate")
        cookie_button.click()
        print("Pop-up closed successfully.")
    except Exception as e:
        print("Could not find the cookie pop-up:", e)
        pass
    
    # Wait for the content to load
    time.sleep(1)  # Adjust this based on how long it takes to load

    # Get the page source after full page load
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    
    # Find all divs with the class "dates-contents"
    dates_contents_div = soup.find('div', class_='dates-content')

    if not dates_contents_div:
        print("Could not find the div with class 'dates-contents'.")
        return links

    # Find all div blocks for movies inside the "dates-contents" div
    movie_blocks = dates_contents_div.find_all('div', class_='block--title text-uppercase')

    # Extract the links from each movie block
    for block in movie_blocks:
        base_url = 'https://www.ugc.fr/'
        a_tag = block.find('a', href=True)
        if a_tag:
            links.append(base_url + a_tag['href'])  # Save the movie link
    
    return links

# URL of the page to scrape
url = 'https://www.ugc.fr/cinema-ugc-cine-cite-strasbourg.html'

# Get the film links
films = get_film_links(url)


# Print all the links found
print(films)
driver.quit()


Pop-up closed successfully.
['https://www.ugc.fr/film_une_pointe_d_amour_16777.html?cinemaId=30', 'https://www.ugc.fr/film_tu_ne_mentiras_point_16997.html?cinemaId=30', 'https://www.ugc.fr/film_les_indomptes_17035.html?cinemaId=30', 'https://www.ugc.fr/film_les_linceuls_16251.html?cinemaId=30', 'https://www.ugc.fr/film_les_regles_de_l_art_16929.html?cinemaId=30', 'https://www.ugc.fr/film_little_jaffna_15790.html?cinemaId=30', 'https://www.ugc.fr/film_thunderbolts_16570.html?cinemaId=30', 'https://www.ugc.fr/film_ma_mere_dieu_et_sylvie_vartan_16715.html?cinemaId=30', 'https://www.ugc.fr/film_des_jours_meilleurs_16566.html?cinemaId=30', 'https://www.ugc.fr/film_la_chambre_de_mariana_16532.html?cinemaId=30', 'https://www.ugc.fr/film_sinners_16774.html?cinemaId=30', 'https://www.ugc.fr/film_zion_16974.html?cinemaId=30', 'https://www.ugc.fr/film_drop_game_16482.html?cinemaId=30', 'https://www.ugc.fr/film_minecraft_le_film_16537.html?cinemaId=30', 'https://www.ugc.fr/film_12_years_a_slave_98

## SCRAPING ALL THE NEEDED INFORMATIONS

In [19]:
def scrape_film_details(driver, films_link):
    all_films_UGC = pd.DataFrame()

    jour_translation = {
        'Monday': 'Lundi',
        'Tuesday': 'Mardi',
        'Wednesday': 'Mercredi',
        'Thursday': 'Jeudi',
        'Friday': 'Vendredi',
        'Saturday': 'Samedi',
        'Sunday': 'Dimanche'
    }
    
    for link in films_link:
        driver.get(link)
        wait = WebDriverWait(driver, 4)

        try:
            cookie_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, ".hagreed__continue.hagreed-validate")))
            cookie_button.click()
        except Exception:
            pass

        # Title
        try:
            titre = driver.find_element(By.XPATH, '//*[@id="film-presentation"]/div[1]/div/div/div/div[2]/h1').text.strip()
        except:
            continue

        # Genre
        try:
            group_info = driver.find_element(By.CSS_SELECTOR, "div.group-info.d-none.d-md-block p.color--dark-blue").text
            genre = group_info.split('·')[0].strip()
        except:
            genre = "Not yet revealed"

        try:
            dates_nav = wait.until(EC.presence_of_element_located((By.CLASS_NAME, "dates-nav")))
            slider_items = dates_nav.find_elements(By.XPATH, './/div[starts-with(@class, "slider-item")]')
        except Exception:
            continue

        lst = []

        for item in slider_items:
            try:

                item_id = item.get_attribute("id")
                date_match = re.search(r'(\d{4}-\d{2}-\d{2})', item_id)
                if not date_match:
                    continue

                date_text = date_match.group(1)  # e.g. '2025-05-05'
                date_obj = datetime.strptime(date_text, '%Y-%m-%d')
                jour = jour_translation.get(date_obj.strftime('%A'), date_obj.strftime('%A'))
                date_formatted = date_obj.strftime('%d/%m/%Y')

                driver.execute_script("arguments[0].scrollIntoView(true);", item)
                time.sleep(0.2)
                driver.execute_script("arguments[0].click();", item)

                WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.CLASS_NAME, "screening-start")))
                horaires = [elem.text.strip() for elem in driver.find_elements(By.CLASS_NAME, "screening-start") if elem.text.strip()]

                for hor in horaires:
                    lst.append({
                        "titre": titre,
                        "genre": genre,
                        "date": date_formatted,
                        "jour": jour,
                        "horaire": hor
                    })

            except Exception:
                continue

        if lst:
            df = pd.DataFrame(lst)
            all_films_UGC = pd.concat([all_films_UGC, df], ignore_index=True)

        time.sleep(0.2)

    return all_films_UGC

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
films_data_UGC = scrape_film_details(driver, films)
display(films_data_UGC)
driver.quit()

Unnamed: 0,titre,genre,date,jour,horaire
0,UNE POINTE D'AMOUR,Comédie,06/05/2025,Mardi,18:00
1,UNE POINTE D'AMOUR,Comédie,06/05/2025,Mardi,20:00
2,UNE POINTE D'AMOUR,Comédie,06/05/2025,Mardi,22:00
3,UNE POINTE D'AMOUR,Comédie,07/05/2025,Mercredi,11:00
4,UNE POINTE D'AMOUR,Comédie,07/05/2025,Mercredi,13:30
...,...,...,...,...,...
872,PEPPA RENCONTRE LE BEBE AU CINEMA,"Famille, Comédie Musicale, Animation",01/06/2025,Dimanche,10:45
873,DAN DA DAN : EVIL EYE,"Action, Science Fiction, Animation",07/06/2025,Samedi,16:00
874,DAN DA DAN : EVIL EYE,"Action, Science Fiction, Animation",08/06/2025,Dimanche,13:45
875,SUR LA ROUTE DE PAPA,Comédie\nSortie le 18 juin 2025\nDe Nabil Aita...,12/06/2025,Jeudi,20:30
