In [None]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import re
from collections import Counter
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [None]:
# Scrape the PacktPublishing repository listing with plain HTTP requests and
# collect one (title, url, stars) tuple per repository into `all_data`.
base_url = "https://github.com/PacktPublishing?page={}&tab=repositories"
headers = {"User-Agent": "Mozilla/5.0"}
all_data = []

# A single session reuses the underlying connection across all pages instead
# of opening a new one per request.
with requests.Session() as session:
    session.headers.update(headers)

    for page in tqdm(range(1, 298)):  # pages 1-297
        url = base_url.format(page)
        # requests applies no timeout by default, so a stalled connection
        # would otherwise block the whole loop indefinitely.
        res = session.get(url, timeout=30)
        # Fail loudly on 4xx/5xx instead of parsing an error page as a
        # listing that contains zero repositories.
        res.raise_for_status()
        soup = BeautifulSoup(res.text, "html.parser")

        repos = soup.select("li[itemprop='owns']")

        for repo in repos:
            title_tag = repo.select_one("a[itemprop='name codeRepository']")
            if not title_tag:
                # List item without a repository name link: nothing to record.
                continue
            title = title_tag.text.strip()
            href = "https://github.com" + title_tag['href']
            # The star count is kept as the displayed text; "0" when the
            # item has no stargazers link.
            stars_tag = repo.select_one("a[href$='/stargazers']")
            stars = stars_tag.text.strip() if stars_tag else "0"
            all_data.append((title, href, stars))


In [None]:
# Split every lower-cased repository title into word tokens and pool all
# tokens into one flat list (URL and star count are ignored here).
all_words = [
    token
    for repo_title, _url, _stars in all_data
    for token in re.findall(r'\b\w+\b', repo_title.lower())
]

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time

# Probe a single listing page with headless Chrome and print the first few
# links found, to check that the selector works on the rendered page.
options = Options()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
driver = webdriver.Chrome(options=options)

# try/finally guarantees the browser process is shut down even if loading
# or parsing raises; otherwise a headless Chrome instance is left running.
try:
    # Load the page
    url = "https://github.com/PacktPublishing?page=1&tab=repositories"
    driver.get(url)
    time.sleep(3)  # fixed pause after navigation before reading the page source

    soup = BeautifulSoup(driver.page_source, "html.parser")

    # New selector: look for links to repositories.
    # NOTE(review): this matches every link whose href starts with
    # /PacktPublishing/, so the count may include non-repository links.
    repos = soup.select("a[href^='/PacktPublishing/']")

    print("🔍 Найдено репозиториев:", len(repos))
    for a in repos[:5]:
        title = a.text.strip()
        href = "https://github.com" + a['href']
        print("-", title, "→", href)
finally:
    driver.quit()


In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time
import pandas as pd
from tqdm import tqdm

# Walk all listing pages with headless Chrome, collect (title, url) for each
# repository link, and save the result to an Excel file.
options = Options()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
driver = webdriver.Chrome(options=options)

data = []

# try/finally guarantees the browser is shut down even if a page load or
# parse fails midway through the ~300 pages.
try:
    for page in tqdm(range(1, 298)):  # pages 1-297
        url = f"https://github.com/PacktPublishing?page={page}&tab=repositories"
        driver.get(url)
        time.sleep(2.5)  # required, otherwise the elements do not have time to appear

        soup = BeautifulSoup(driver.page_source, "html.parser")
        repos = soup.select("a[href^='/PacktPublishing/']")

        for a in repos:
            path = a['href']
            # The prefix selector also matches per-repository sub-pages
            # (e.g. .../stargazers). Keep only /PacktPublishing/<repo>,
            # i.e. paths with exactly two segments.
            if path.strip("/").count("/") != 1:
                continue
            title = a.text.strip()
            if title:  # skip links with no text
                data.append((title, "https://github.com" + path))
finally:
    driver.quit()

# 📊 Save to a table
df = pd.DataFrame(data, columns=["Title", "URL"])
df.to_excel("packt_repos.xlsx", index=False)
print("✅ Готово! Данные сохранены в 'packt_repos.xlsx'")


In [None]:
import re
from collections import Counter

# Tokenise each lower-cased title from the scraped table; the regex keeps
# runs of word characters only.
all_words = [
    token
    for repo_title in df['Title']
    for token in re.findall(r'\b\w+\b', repo_title.lower())
]

# Count the tokens and report the ten most frequent ones.
word_counts = Counter(all_words)
top10 = word_counts.most_common(10)
print("🔝 Топ-10 слов:", top10)


In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Words excluded from the cloud before rendering.
stopwords = {
    "and", "with", "the", "for", "from", "using", "in", "of", "to", "on", "by",
    "a", "an", "at", "as", "is", "this", "that", "you", "your",
}

# Drop the stop words, then join the remaining tokens into one
# space-separated string, which is the input WordCloud.generate expects.
filtered = list(filter(lambda word: word not in stopwords, all_words))
text = " ".join(filtered)

cloud_options = dict(width=800, height=400, background_color='white')
wc = WordCloud(**cloud_options).generate(text)

# Render the cloud as an image with the axes hidden.
plt.figure(figsize=(12, 6))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()


In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import pandas as pd
import time
from tqdm import tqdm

# Walk all listing pages with headless Chrome, collect (title, url, stars)
# for each repository list item, and save the result to an Excel file.
options = Options()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

driver = webdriver.Chrome(options=options)

data = []

# try/finally guarantees the browser is shut down even if a page load or
# parse fails midway; otherwise a headless Chrome process is left running.
try:
    for page in tqdm(range(1, 298)):  # pages 1-297
        url = f"https://github.com/PacktPublishing?page={page}&tab=repositories"
        driver.get(url)
        time.sleep(2.5)  # fixed pause after navigation before reading the page source

        soup = BeautifulSoup(driver.page_source, "html.parser")
        repos = soup.select("li[itemprop='owns']")

        for repo in repos:
            title_tag = repo.select_one("a[itemprop='name codeRepository']")
            if not title_tag:
                # List item without a repository name link: nothing to record.
                continue
            title = title_tag.text.strip()
            href = "https://github.com" + title_tag['href']

            # The star count is kept as the displayed text; "0" when the
            # item has no stargazers link.
            star_tag = repo.select_one("a[href$='/stargazers']")
            stars = star_tag.text.strip() if star_tag else "0"

            data.append((title, href, stars))
finally:
    driver.quit()

# 💾 Save to Excel
df = pd.DataFrame(data, columns=["Title", "URL", "Stars"])
df.to_excel("packt_books.xlsx", index=False)
print("✅ Данные сохранены в packt_books.xlsx")
