In [1]:
pip install -U duckduckgo_search

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install urlopen

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [3]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.request import urlopen
from duckduckgo_search import DDGS
from datetime import date
import time
import re

# Fetch and parse the webpage
url = "https://collider.com/andrei-tarkovsky-movies-ranked-imdb/"
# The context manager closes the HTTP connection as soon as the body has been read,
# and the timeout keeps a stalled server from hanging the notebook indefinitely.
with urlopen(url, timeout=30) as page:
    # Use the charset announced in the response headers; fall back to UTF-8 when absent.
    html = page.read().decode(page.headers.get_content_charset() or "utf-8")
soup = BeautifulSoup(html, "html.parser")

# Extract page title
# soup.title is None when the document has no <title> element; in that case reuse the
# caption text that index.md already shows instead of failing with AttributeError.
page_title_dirty = (
    soup.title.get_text(strip=True) if soup.title else "Andrei Tarkovsky's Best Movies"
)
# Remove everything from ", Ranked" to the end of the title.
page_title = re.sub(r", Ranked.*", "", page_title_dirty)

# Find and download the large header image
image_path = "assets/images/tarkovsky-banner.jpg"
os.makedirs(os.path.dirname(image_path), exist_ok=True)

# NOTE(review): the query spells "chanel" (sic); it is left unchanged so the search
# keeps returning the same kind of result as before.
search_image = list(DDGS().images("tarkovsky the criterion chanel", layout="Wide", max_results=1))
if not search_image:
    # Fail with a clear message instead of an IndexError on search_image[0].
    raise RuntimeError("Image search returned no result for the header image.")
image_url = search_image[0]['image']

# The timeout stops a dead host from blocking the cell; raise_for_status() prevents an
# HTTP error page from being written to disk as if it were the JPEG.
response = requests.get(image_url, timeout=30)
response.raise_for_status()
with open(image_path, "wb") as img_file:
    img_file.write(response.content)

# Make sure the Jekyll posts folder exists before any post is written into it.
os.makedirs("_posts", exist_ok=True)

# Text of the two Jekyll support files, keyed by the file name each one is written to.
# Every entry is assembled line by line and ends with a trailing newline.
jekyll_files = {
    # Site-wide Jekyll configuration.
    "_config.yml": "\n".join([
        'title: "Top Tarkovsky Movies"',
        "description: \"A collection of Andrei Tarkovsky's movies\"",
        "theme: minimal-mistakes-jekyll",
        "minimal_mistakes_skin: default",
        "author:",
        '  name: "Pavel Charheika"',
        "paginate: 5",
        "permalink: /:title/",
        "plugins:",
        "  - jekyll-feed",
        "  - jekyll-seo-tag",
        "  - jekyll-sitemap",
        "  - jekyll-include-cache",
        "",
    ]),
    # Ruby gem list used to build the site.
    "Gemfile": "\n".join([
        'source "https://rubygems.org"',
        "",
        'gem "github-pages", group: :jekyll_plugins',
        'gem "jekyll-include-cache", group: :jekyll_plugins',
        "",
    ]),
}

# Write _config.yml first and Gemfile second (dict insertion order).
for file_name, file_text in jekyll_files.items():
    with open(file_name, "w", encoding="utf-8") as config_file:
        config_file.write(file_text)

# Prepare the movie data
SEARCH_PAUSE_SECONDS = 2        # pause before every DuckDuckGo query
DOWNLOAD_TIMEOUT_SECONDS = 30   # upper bound for a single image download


def _safe_filename(name):
    """Return *name* with characters that are invalid in file names replaced by "_".

    Names without such characters are returned unchanged; an empty result
    becomes "untitled".
    """
    return re.sub(r'[\\/:*?"<>|]', "_", name) or "untitled"


def _download_image(source_url, destination):
    """Save the image at *source_url* to *destination*.

    Returns True when the file was written. On a network error, timeout or
    HTTP error status a message is printed and False is returned, so nothing
    is written to *destination*.
    """
    try:
        reply = requests.get(source_url, timeout=DOWNLOAD_TIMEOUT_SECONDS)
        reply.raise_for_status()
    except requests.RequestException as error:
        print(f"Could not download {source_url}: {error}")
        return False
    with open(destination, "wb") as img_file:
        img_file.write(reply.content)
    return True


titles = []
data = []
movies = soup.find_all('h2')

# The stills are stored next to the banner; create the folder here as well so this
# step does not depend on another step having created it.
os.makedirs("assets/images", exist_ok=True)

for movie in movies:
    span = movie.find('span')
    if not span:
        continue  # Skip headings without a <span>

    # The first <span> inside the heading holds the rank, its next sibling the title.
    rank = span.get_text(strip=True)

    title_span = span.find_next_sibling()
    title = title_span.get_text(strip=True) if title_span else "Unknown"

    # The next <h3> sibling of the heading is used as the rating text.
    rating_tag = movie.find_next_sibling('h3')
    rating = rating_tag.get_text(strip=True) if rating_tag else "N/A"

    # The next two <p> siblings are used as the description paragraphs.
    description_tag = movie.find_next_sibling('p')
    sentence1 = description_tag.get_text(strip=True) if description_tag else "No description"

    sentence2 = "No description"
    if description_tag:
        next_p = description_tag.find_next_sibling('p')
        if next_p:
            sentence2 = next_p.get_text(strip=True)

    time.sleep(SEARCH_PAUSE_SECONDS)
    # Search for IMDb link; "#" is the placeholder when nothing is found.
    search_result = DDGS().text(f"{title} - IMDb", max_results=1)
    link = search_result[0]['href'] if search_result else "#"

    time.sleep(SEARCH_PAUSE_SECONDS)

    search_image = list(DDGS().images(f"{title} filmaffinity", layout="Wide", max_results=1))

    # Strip optional surrounding quotes and the trailing " (YYYY)" from the title.
    clean_title = re.sub(r"['\"]?(.*?)['\"]? \(\d{4}\)", r"\1", title)
    images_path = f"assets/images/{_safe_filename(clean_title)}.jpg"

    # An empty search result or a failed download must not abort the whole loop:
    # fall back to the banner image (image_path, set earlier in this cell) instead.
    if search_image and _download_image(search_image[0]['image'], images_path):
        image_url = search_image[0]['image']
    else:
        print(f"No still available for {clean_title!r}; using the banner image instead.")
        images_path = image_path

    movie_info = {
        "Rank": rank,
        "Title": clean_title,
        "Rating": rating,
        "Sentence1": sentence1,
        "Sentence2": sentence2,
        "Link": link,
        "Image": images_path
    }
    titles.append(title)
    data.append(movie_info)

# Create index.md (Home page) and one post per movie
def _yaml_escape(text):
    """Escape backslashes and double quotes so *text* fits in a double-quoted YAML scalar."""
    return text.replace("\\", "\\\\").replace('"', '\\"')


def _post_slug(title):
    """Turn *title* into the file-name part of a post.

    Spaces become underscores (as before); characters that are invalid in file
    names are replaced by "_" as well.
    """
    return re.sub(r'[\\/:*?"<>|]', "_", title.replace(' ', '_'))


# Evaluate the date once so the file name and the front matter of every post agree,
# even if the run crosses midnight.
generated_on = date.today()

with open("index.md", "w", encoding="utf-8") as index_file:
    index_file.write(f"""\
---
layout: home
title: "{_yaml_escape(page_title)}"
header:
  overlay_image: "/{image_path}"
  caption: "Andrei Tarkovsky's Best Movies"
---

## List of Tarkovsky's Movies

""")

# Posts are written after index.md has been closed; nothing further is written to it.
os.makedirs("_posts", exist_ok=True)

for movie in data:
    filename = f"_posts/{generated_on}-{_post_slug(movie['Title'])}.md"
    # Escaped once here because the title appears twice inside quoted YAML values.
    yaml_title = _yaml_escape(movie['Title'])

    # Create individual movie pages
    with open(filename, "w", encoding="utf-8") as post_file:
        post_file.write(f"""\
---
title: "{yaml_title}"
date: {generated_on}
layout: single
header:
  image: "/{movie['Image']}"
  filter: linear-gradient(rgba(255, 0, 0, 0.5), rgba(0, 255, 255, 0.5))
  caption: "A still from {yaml_title}"
---

## {movie['Rank']} - {movie['Title']}

**Rating:** {movie['Rating']}

{movie['Sentence1']}

{movie['Sentence2']}

[IMDb - {movie['Title']}]({movie['Link']})
""")

print("Jekyll site files have been generated.")



Jekyll site files have been generated.
