# In [None]:  (Jupyter notebook cell prompt kept from the export)
import warnings
warnings.filterwarnings("ignore")

# LIBRARIES
import inspect
import os
import random
import re
import string
import subprocess
import sys
import time
from datetime import datetime
from urllib.parse import unquote, urljoin

import nltk
import pandas as pd
import pymorphy2
import requests
import spacy
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from nltk import word_tokenize
from nltk.corpus import stopwords

print("🔧 Инициализация библиотек...")


# NLTK SETUP
def _ensure_nltk_resources():
    """Make sure the NLTK tokenizer and stop-word data are installed.

    Every required resource is looked up in turn; as soon as one lookup
    raises ``LookupError`` all three packages are downloaded quietly.
    """
    required_paths = (
        'tokenizers/punkt',
        'tokenizers/punkt_tab',
        'corpora/stopwords',
    )
    try:
        for resource_path in required_paths:
            nltk.data.find(resource_path)
    except LookupError:
        print("Скачиваем NLTK ресурсы...")
        for package_name in ('punkt', 'punkt_tab', 'stopwords'):
            nltk.download(package_name, quiet=True)


_ensure_nltk_resources()
print(" ✅ NLTK готов")

# PYMORPHY2 FIX (avoids the AttributeError raised when inspect.getargspec
# is missing; the function was removed from the stdlib in Python 3.11).
def patch_pymorphy2():
    """Install a stand-in for ``inspect.getargspec`` when the stdlib lacks it.

    The replacement is built on ``inspect.getfullargspec`` and returns the
    legacy 4-tuple ``(args, varargs, varkw, defaults)``.  When the running
    interpreter (or an earlier call) already provides ``inspect.getargspec``
    it is left untouched, so calling this function repeatedly is harmless.
    """
    if hasattr(inspect, 'getargspec'):
        # Never clobber a working implementation.
        return

    def getargspec_patch(func):
        """Return ``(args, varargs, varkw, defaults)`` for *func*.

        Objects that cannot be introspected yield an empty specification
        instead of raising, which keeps the patch best-effort.
        """
        try:
            spec = inspect.getfullargspec(func)
        except TypeError:
            # getfullargspec reports every unsupported callable as TypeError.
            return [], None, None, None
        return spec.args, spec.varargs, spec.varkw, spec.defaults

    inspect.getargspec = getargspec_patch


patch_pymorphy2()

# PYMORPHY2: morphological analyzer used to lemmatize Russian tokens.
morph = pymorphy2.MorphAnalyzer()
print(" ✅ Pymorphy2 готов")

# SPACY: English pipeline used to lemmatize English tokens.
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    # spacy.load raises OSError when the model package is not installed.
    print("Скачиваем spaCy модель: python -m spacy download en_core_web_sm")
    # Run the download with the interpreter that executes this code, so the
    # model lands in the active environment rather than in whatever
    # "python" happens to be first on PATH.
    subprocess.run(
        [sys.executable, '-m', 'spacy', 'download', 'en_core_web_sm'],
        check=True,
    )
    nlp = spacy.load('en_core_web_sm')
print(" ✅ spaCy готов")

# STOP WORDS: English and Russian lists merged into one lookup set.
stop_words = set(stopwords.words('english') + stopwords.words('russian'))
print(" ✅ Стоп-слова загружены")

# GAME LIST: titles used to label every scraped document.
GAMES = [
    "Hollow Knight",
    "Hollow Knight Silksong",
    "Platypus",
    "Hard Truck Apocalypse",
    "No Man's Sky",
    "Moonlighter",
    "Minecraft",
]

# SEED URLS: pages to scrape, grouped by the game they cover
# (55 distinct addresses in total).
_SEED_URLS_BY_GAME = {
    "Hollow Knight": (
        "https://en.wikipedia.org/wiki/Hollow_Knight",
        "https://hollowknight.fandom.com/wiki/Hollow_Knight",
        "https://store.steampowered.com/app/367520/Hollow_Knight",
        "https://www.metacritic.com/game/pc/hollow-knight",
        "https://www.ign.com/articles/2018/06/22/hollow-knight-review",
        "https://www.pcgamer.com/hollow-knight-review",
        "https://www.gamespot.com/reviews/hollow-knight-review-an-exceptional-adventure/1900-6416972",
        "https://www.eurogamer.net/hollow-knight-review",
        "https://www.polygon.com/2017/6/23/15856280/hollow-knight-review-pc",
        "https://www.reddit.com/r/HollowKnight/comments/8s7j4k/hollow_knight_review_thread/",
    ),
    "Hollow Knight Silksong": (
        "https://hollowknight.fandom.com/wiki/Silksong",
        "https://www.pcgamer.com/games/action/hollow-knight-silksong-review",
        "https://www.metacritic.com/game/pc/hollow-knight-silksong",
        "https://www.ign.com/articles/hollow-knight-silksong-review",
        "https://www.gamespot.com/reviews/hollow-knight-silksong-review/1900-6416980",
        "https://www.eurogamer.net/hollow-knight-silksong-review",
        "https://www.polygon.com/silksong-review-hollow-knight-polygon-score-metacritic",
        "https://store.steampowered.com/app/2028990/Hollow_Knight_Silksong",
        "https://www.reddit.com/r/Silksong/comments/1f5j8k/hollow_knight_silksong_review_thread/",
        "https://www.nintendolife.com/reviews/nintendo-switch/hollow-knight-silksong",
    ),
    "Platypus": (
        "https://en.wikipedia.org/wiki/Platypus_(video_game)",
        "https://store.steampowered.com/app/307340/Platypus",
        "https://www.mobygames.com/game/10766/platypus",
        "https://www.metacritic.com/game/pc/platypus",
        "https://www.ign.com/games/platypus",
    ),
    "Hard Truck Apocalypse": (
        "https://en.wikipedia.org/wiki/Hard_Truck_Apocalypse",
        "https://store.steampowered.com/app/307320/Hard_Truck_Apocalypse",
        "https://www.mobygames.com/game/14994/hard-truck-apocalypse",
        "https://www.metacritic.com/game/pc/hard-truck-apocalypse",
        "https://www.ign.com/games/hard-truck-apocalypse",
    ),
    "No Man's Sky": (
        "https://en.wikipedia.org/wiki/No_Man%27s_Sky",
        "https://www.nomanssky.com/news",
        "https://store.steampowered.com/app/275850/No_Mans_Sky",
        "https://www.ign.com/articles/no-mans-sky-review",
        "https://www.pcgamer.com/no-mans-sky-review",
        "https://www.gamespot.com/reviews/no-mans-sky-review/1900-6416492",
        "https://www.eurogamer.net/no-mans-sky-review",
        "https://www.polygon.com/2016/8/12/12461520/no-mans-sky-review-ps4-playstation-4-pc-windows-hello-games-sony",
        "https://www.metacritic.com/game/pc/no-mans-sky",
        "https://www.reddit.com/r/NoMansSkyTheGame/comments/1f5j8k/no_mans_sky_review_thread/",
    ),
    "Moonlighter": (
        "https://en.wikipedia.org/wiki/Moonlighter_(video_game)",
        "https://store.steampowered.com/app/606150/Moonlighter",
        "https://www.ign.com/games/moonlighter",
        "https://www.pcgamer.com/moonlighter-review",
        "https://www.gamespot.com/reviews/moonlighter-review-open-for-business/1900-6416930",
    ),
    "Minecraft": (
        "https://en.wikipedia.org/wiki/Minecraft",
        "https://www.minecraft.net/en-us",
        "https://minecraft.fandom.com/wiki/Minecraft_Wiki",
        "https://www.ign.com/games/minecraft",
        "https://www.pcgamer.com/minecraft-review",
        "https://www.gamespot.com/reviews/minecraft-review/1900-6346734",
        "https://www.eurogamer.net/minecraft-review",
        "https://store.steampowered.com/app/221100/Minecraft",
        "https://www.metacritic.com/game/pc/minecraft",
        "https://www.reddit.com/r/Minecraft/comments/1f5j8k/minecraft_review_thread/",
    ),
}

# Flat list with the seed URLs repeated 30 times (55 * 30 = 1650 entries).
# NOTE(review): the repeats add no extra scraping attempts, because the
# de-duplication right below reduces the list to the 55 distinct URLs again.
BASE_URLS = [
    page_url
    for game_urls in _SEED_URLS_BY_GAME.values()
    for page_url in game_urls
] * 30

# DROP DUPLICATES (first occurrence wins, order kept) AND CAP AT 1500
unique_urls = list(dict.fromkeys(BASE_URLS))[:1500]
print(f"\n📚 URL для обработки: {len(unique_urls)} (цель: 1000 уникальных документов)")

# HTTP SESSION
def create_http_session():
    """Build a ``requests.Session`` that sends a randomly chosen User-Agent.

    Returns:
        The new session.  Its ``User-Agent`` header is set once, at
        creation time, to the value of ``UserAgent().random``.
    """
    http_session = requests.Session()
    http_session.headers['User-Agent'] = UserAgent().random
    return http_session

# PAGE PARSING
def _extract_page_fields(html):
    """Turn raw HTML into the ``parse_page`` result dict.

    Returns ``None`` when the page yields fewer than 100 characters of text.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # soup.title.string is None for an empty <title> or one with nested
    # tags; get_text() copes with both cases.
    title = soup.title.get_text(' ', strip=True) if soup.title else ''
    if not title:
        title = 'No Title'

    container_tags = ['p', 'div', 'article']
    chunks = []
    for elem in soup.find_all(container_tags):
        # A container nested inside another selected container would repeat
        # text that the outer element already contributes.
        if elem.find_parent(container_tags) is not None:
            continue
        # The explicit separator keeps words from being glued together
        # across inline tags ("Hello <b>world</b>" -> "Hello world").
        chunk = elem.get_text(' ', strip=True)
        if chunk:
            chunks.append(chunk)

    raw_text = ' '.join(chunks)
    if len(raw_text) < 100:
        return None

    return {
        'title': title,
        'raw_text': raw_text[:5000],
        # Date of scraping, not the publication date of the page.
        'date': datetime.now().strftime('%Y-%m-%d'),
    }


def parse_page(url, session, max_retries=3):
    """Download *url* and extract its title and visible text.

    Args:
        url: Address of the page to fetch.
        session: Object exposing a ``requests.Session``-style ``get``.
        max_retries: Maximum number of download attempts.

    Returns:
        A dict with ``'title'``, ``'raw_text'`` (at most 5000 characters)
        and ``'date'`` (today, ``YYYY-MM-DD``), or ``None`` when every
        download attempt failed, the HTML could not be processed, or the
        page has fewer than 100 characters of text.
    """
    for attempt in range(max_retries):
        try:
            response = session.get(url, timeout=10)
            response.raise_for_status()
            return _extract_page_fields(response.content)
        except requests.RequestException as e:
            # Network/HTTP problems may be transient: pause, then retry.
            print(f"⚠ Ошибка при парсинге {url}: {e}")
            if attempt < max_retries - 1:
                time.sleep(random.uniform(1, 3))
        except Exception as e:
            # Best-effort scraping: a parsing failure is deterministic, so
            # downloading the same page again would not help.
            print(f"⚠ Ошибка при парсинге {url}: {e}")
            return None
    return None

# CLEANING AND LEMMATIZATION
# re.escape is required here: unescaped, the "\]" inside string.punctuation
# is read as an escaped bracket and backslashes are never matched.
_PUNCTUATION_RE = re.compile('[{}]'.format(re.escape(string.punctuation)))


def clean_text(raw_text, is_russian=False):
    """Normalize *raw_text* and reduce it to at most 200 lemmas.

    Markup remnants (tags, HTML entities), numbers and ASCII punctuation
    are replaced by spaces, the text is lower-cased and tokenized, then
    alphabetic non-stop-word tokens are lemmatized.

    Args:
        raw_text: Text to clean.
        is_russian: Lemmatize with the module-level pymorphy2 analyzer
            (``morph``) when true, with the spaCy pipeline (``nlp``)
            otherwise.

    Returns:
        ``(cleaned_text, token_count)``: the space-joined lemmas (first
        200 only) and how many of them were kept.
    """
    if not raw_text:
        return '', 0

    text = re.sub(r'<[^>]+>', ' ', raw_text)
    text = re.sub(r'&[a-zA-Z0-9#]+;', ' ', text)
    text = re.sub(r'\d+\.?\d*', ' ', text)
    text = _PUNCTUATION_RE.sub(' ', text)
    text = re.sub(r'\s+', ' ', text.strip())

    tokens = word_tokenize(text.lower())
    if is_russian:
        lemmas = [
            morph.parse(token)[0].normal_form
            for token in tokens
            if token.isalpha() and token not in stop_words
        ]
    else:
        doc = nlp(' '.join(tokens))
        lemmas = [
            token.lemma_
            for token in doc
            if token.is_alpha and token.text not in stop_words
        ]

    kept = lemmas[:200]
    return ' '.join(kept), len(kept)

# GAME DETECTION
def get_game_from_url(url, title, games=None):
    """Guess which game a page is about from its URL and title.

    Names are compared after lower-casing and dropping every
    non-alphanumeric character, so ``hollow-knight``, ``Hollow_Knight`` and
    ``No_Man%27s_Sky`` all match their game.  Longer names are tried first
    so that "Hollow Knight Silksong" is not shadowed by "Hollow Knight".

    Args:
        url: Page address (percent-escapes are decoded before matching).
        title: Page title.
        games: Candidate game names; defaults to the module-level ``GAMES``.

    Returns:
        The matching entry of *games*, or ``"Unknown"`` when none matches.
    """
    if games is None:
        games = GAMES

    def squash(text):
        # Lower-case and keep letters/digits only.
        return re.sub(r'[\W_]+', '', text.lower())

    haystacks = (squash(unquote(url or '')), squash(title or ''))
    candidates = sorted(games, key=lambda name: len(squash(name)), reverse=True)
    for game in candidates:
        key = squash(game)
        # An empty key would match everything, so it is skipped.
        if key and any(key in haystack for haystack in haystacks):
            return game
    return "Unknown"

# MAIN PROCESS
def _looks_russian(text):
    """Return True when Cyrillic letters outnumber Latin letters in *text*."""
    cyrillic = len(re.findall(r'[а-яё]', text, flags=re.IGNORECASE))
    latin = len(re.findall(r'[a-z]', text, flags=re.IGNORECASE))
    return cyrillic > latin


def _scrape_document(url, session, doc_id):
    """Fetch and clean one page; return its corpus record or None if skipped."""
    parsed = parse_page(url, session)
    if not (parsed and parsed['raw_text']):
        print(" ⚠ Пропущен: ошибка парсинга")
        return None

    # The language is judged by the script of the text itself: an English
    # page that merely mentions the word "Russian" must stay on the
    # English pipeline.
    is_russian = _looks_russian(parsed['raw_text'])
    cleaned_text, token_count = clean_text(parsed['raw_text'], is_russian)
    if token_count <= 10:
        print(" ⚠ Пропущен: слишком мало текста")
        return None

    return {
        'doc_id': doc_id,
        'game': get_game_from_url(url, parsed['title']),
        'title': parsed['title'][:100],
        'url': url,
        # Only a 1000-character excerpt of the raw text is stored.
        'raw_text': parsed['raw_text'][:1000],
        'cleaned_text': cleaned_text,
        'tokens_count': token_count,
        'date': parsed['date'],
    }


def _write_txt_corpus(corpus_data, path):
    """Write every document to *path* in the plain-text corpus layout."""
    with open(path, 'w', encoding='utf-8') as f:
        for doc in corpus_data:
            f.write(f"=== Document {doc['doc_id']} | {doc['game']} | {doc['title']} | {doc['url']} ===\n")
            f.write(f"Tokens: {doc['tokens_count']} | Date: {doc['date']}\n")
            f.write(f"{doc['cleaned_text']}\n---\n")


def main():
    """Scrape the seed URLs, build the corpus and save it.

    Visits the URLs of ``unique_urls`` in random order until 1000
    documents are collected or the list is exhausted, pausing 0.5-1.5 s
    after each URL.  Documents with more than 10 tokens are kept.  The
    corpus is written to ``game_corpus_1000.txt`` and
    ``game_corpus_1000.csv`` in the current directory and summary
    statistics are printed.
    """
    print("\n🚀 Запуск парсинга 1500+ URL (цель: 1000 документов)...")
    corpus_data = []
    parsed_urls = set()
    doc_id = 1

    # Shuffle a copy so the module-level list is not reordered as a side
    # effect of running main().
    urls = list(unique_urls)
    random.shuffle(urls)

    # The context manager closes the session's connections when done.
    with create_http_session() as session:
        for url in urls:
            if doc_id > 1000:
                break
            if url in parsed_urls:
                continue
            print(f" 🔍 Парсинг {url}...")
            doc = _scrape_document(url, session, doc_id)
            if doc is not None:
                corpus_data.append(doc)
                parsed_urls.add(url)
                print(f" ✅ Документ {doc_id} добавлен: {doc['tokens_count']} токенов ({doc['game']})")
                doc_id += 1
            # Pause between requests, whether or not the page was kept.
            time.sleep(random.uniform(0.5, 1.5))

    # SAVE THE CORPUS AS TXT
    print("\n💾 Создание корпуса: game_corpus_1000.txt")
    _write_txt_corpus(corpus_data, 'game_corpus_1000.txt')
    print(f" ✅ TXT-корпус сохранён: {len(corpus_data)} документов")

    # SAVE AS CSV
    df = pd.DataFrame(corpus_data)
    df.to_csv('game_corpus_1000.csv', index=False, encoding='utf-8')
    print(" ✅ CSV сохранён: game_corpus_1000.csv")

    # STATISTICS
    total_tokens = sum(doc['tokens_count'] for doc in corpus_data)
    print("\n📊 Статистика корпуса:")
    print(f" Документов: {len(corpus_data)}")
    print(f" Токенов: {total_tokens}")
    if corpus_data:
        print(f" Среднее токенов/документ: {total_tokens / len(corpus_data):.2f}")
    else:
        print(" Нет данных")
    print(f" Уникальных URL: {len(parsed_urls)}")
    print("\n🎉 Корпус готов для NLP (LDA, кластеризация)!")


if __name__ == "__main__":
    main()