In [None]:
import requests as rq
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
from nltk import download as nltk_download
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import nltk
import string
import spacy
from collections import Counter
from itertools import chain
import matplotlib.pyplot as plt
import re
import random
import seaborn as sns
import eng_spacysentiment

# Парсинг новостей по экологии

In [None]:
# Build the URLs of the section listing pages (climate, pollution, energy,
# wildlife) that contain links to individual news articles:
# 15 listing pages per section.
base_url = "https://www.theguardian.com/environment/"
page_numbers = range(1, 16)

all_pages_climate = [f'{base_url}climate-crisis?page={number}' for number in page_numbers]
all_pages_pollution = [f'{base_url}pollution?page={number}' for number in page_numbers]
all_pages_energy = [f'{base_url}energy?page={number}' for number in page_numbers]
all_pages_wildlife = [f'{base_url}wildlife?page={number}' for number in page_numbers]

In [None]:
def GetLinks(all_pages, timeout=30):
    """
    Collect links to text-format news articles from section listing pages.

    Each page in ``all_pages`` is downloaded and scanned for headline anchors.
    Links whose URL contains a marker of a non-text format (video, audio,
    gallery, "commentisfree", "ng-interactive") are skipped.

    Parameters
    ----------
    all_pages : iterable of str
        URLs of the listing pages to scan.
    timeout : float, optional
        Seconds to wait for each HTTP response before giving up
        (default 30). Without it a stalled connection blocks forever.

    Returns
    -------
    list of str
        The ``href`` values of the accepted anchors, in page order.
        Duplicates are not removed.
    """
    excluded_markers = ('video', 'audio', 'gallery', 'commentisfree', 'ng-interactive')

    all_links = []
    for page in all_pages:
        response = rq.get(page, timeout=timeout)
        soup = BeautifulSoup(response.text, features="html.parser")
        for a in soup.find_all("a", class_="u-faux-block-link__overlay js-headline-text"):
            href = a.get('href')
            # An anchor without an href yields None; testing `'video' not in None`
            # would raise TypeError and abort the whole collection run.
            if not href:
                continue
            if not any(marker in href for marker in excluded_markers):
                all_links.append(href)

    return all_links

In [None]:
def GetNews(url, timeout=30):
    """
    Download one article page and extract its metadata and body text.

    Parameters
    ----------
    url : str
        Address of the article page.
    timeout : float, optional
        Seconds to wait for the HTTP response (default 30), so a stalled
        connection cannot hang the scraping loop.

    Returns
    -------
    tuple
        ``(url, category, title, final_text)`` where ``category`` is the
        comma-separated text of the tag anchors, ``title`` is the text of the
        first ``<h1>`` and ``final_text`` is the article paragraphs joined by
        spaces ('' when no known paragraph class is present).

    Raises
    ------
    AttributeError
        If the page has no ``<h1>`` element (``soup.find`` returns None).
    """
    page = rq.get(url, timeout=timeout)
    soup = BeautifulSoup(page.text, features="html.parser")

    category = ', '.join(tag.text for tag in soup.find_all('a', class_='dcr-1gwziyt'))
    title = soup.find('h1').text

    # The site keeps changing the CSS classes of its paragraph tags. These are
    # the ones seen so far; they were recorded a while ago and may no longer
    # match the live site.
    paragraph_classes = (
        'dcr-1lpi6p1',
        'dcr-epamsi',
        'dcr-vq85ex',
        'dcr-ppzeq1',
        'dcr-1fp5gi9',
        'dcr-1yimvw',
    )

    text = []
    for css_class in paragraph_classes:
        paragraphs = soup.find_all('p', class_=css_class)
        # No `break` on purpose: when several classes match, the last one in
        # the tuple wins, exactly as the original chain of `if` blocks did.
        if paragraphs:
            text = [paragraph.text for paragraph in paragraphs]

    final_text = ' '.join(text)
    final_text = final_text.replace('\xa0', ' ').replace('\n', '').strip()

    return url, category, title, final_text

In [None]:
def get_all_news(all_links):
    """
    Download every article in ``all_links`` with GetNews, showing a progress bar.

    A link whose download or parsing raises is reported on stdout and skipped,
    so one broken page does not stop the run.

    Returns a list with one GetNews result per successfully processed link,
    in input order.
    """
    collected = []

    for url in tqdm(all_links):
        try:
            collected.append(GetNews(url))
        except Exception as exc:
            print(f'Failed: {url}, error: {exc}')

    return collected

In [None]:
# Collect the article links of every section from its listing pages.
# Each call performs one HTTP request per listing page.
all_links_climate = GetLinks(all_pages_climate)
all_links_pollution = GetLinks(all_pages_pollution)
all_links_energy = GetLinks(all_pages_energy)
all_links_wildlife = GetLinks(all_pages_wildlife)

In [None]:
# Preview the first ten collected climate links as a sanity check.
all_links_climate[0:10]

In [None]:
# Download and parse every collected article, section by section.
# Links that fail are reported by get_all_news and skipped.
news_climate = get_all_news(all_links_climate)
news_pollution = get_all_news(all_links_pollution)
news_energy = get_all_news(all_links_energy)
news_wildlife = get_all_news(all_links_wildlife)

In [None]:
# Report how many articles were successfully downloaded per section.
for section_label, section_news in (
    ('Climate', news_climate),
    ('Pollution', news_pollution),
    ('Energy', news_energy),
    ('Wildlife', news_wildlife),
):
    print(f'No. of {section_label} News: {len(section_news)}')

In [None]:
# Column names follow the order of the tuple returned by GetNews.
columns = ['link', 'category', 'title', 'text']

# One DataFrame per section, all sharing the same columns.
df_climate, df_pollution, df_energy, df_wildlife = (
    pd.DataFrame(section_news, columns=columns)
    for section_news in (news_climate, news_pollution, news_energy, news_wildlife)
)

In [None]:
# Merge the four section frames into one corpus table and drop articles that
# were collected more than once (e.g. listed in several sections).
df_environment = pd.concat([df_climate, df_pollution, df_energy, df_wildlife], ignore_index=True)
df_environment = df_environment.drop_duplicates()

# GetNews yields an empty string (not NaN) when none of its known paragraph
# classes matched, so dropna() alone never removes those failed scrapes.
# Drop blank texts explicitly, then any rows with real missing values.
df_environment = df_environment[df_environment['text'].str.strip().ne('')]
df_environment = df_environment.dropna()

# Reset the index last so it stays contiguous after all the filtering.
df_environment = df_environment.reset_index(drop=True)

In [None]:
# Display the merged table of articles (link, category, title, text).
df_environment

---

# Препроцессинг данных и частотный анализ

In [None]:
# Load the spaCy pipeline "en_core_web_lg"; preprocess_sentwise reads token lemmas from it.
nlp = spacy.load("en_core_web_lg")
# Download the NLTK "punkt" resource.
# NOTE(review): presumably required by sent_tokenize/word_tokenize; recent
# NLTK releases may ask for "punkt_tab" instead — confirm against the installed version.
nltk_download ('punkt')

In [None]:
# Read the stop-word list from stops.txt: one entry per line, with
# surrounding whitespace (including the trailing newline) stripped.
with open('stops.txt') as file:
    lines = file.readlines()

stop_words = [line.strip() for line in lines]

In [None]:
# Characters treated as punctuation: the ASCII punctuation set followed by
# dashes, guillemets and a literal '...'.
# Built from string.punctuation instead of a hand-typed literal because the
# old literal contained the invalid escape sequence '\]', which newer Python
# versions warn about. The resulting value is character-for-character the same.
punctuation = string.punctuation + '—»«...–'

# Everything that preprocessing must drop: stop words plus each punctuation character.
# NOTE: the name `filter` shadows the built-in; it is kept because
# preprocess_sentwise reads this global by that name.
filter = stop_words + list(punctuation)

In [None]:
# Plain Python list with the text of every article in the table.
full_texts = df_environment['text'].values.tolist()

In [None]:
# There turned out to be a lot of news texts, so only a sample of them is used
# for this project.
SAMPLE_SIZE = 600
SAMPLE_SEED = 42  # fixed seed so a re-run of the notebook draws the same sample

# random.sample draws WITHOUT replacement; random.choices (used before) draws
# with replacement, so the same article could enter the corpus several times
# and inflate every frequency. min() guards against having fewer texts than
# SAMPLE_SIZE.
corpus = random.Random(SAMPLE_SEED).sample(full_texts, k=min(SAMPLE_SIZE, len(full_texts)))

# Join the articles with a space. The previous ''.join over individual
# characters glued the last sentence of one article to the first sentence of
# the next.
combined_text = ' '.join(corpus)

# Some texts contain inserted calls to subscribe / follow; remove them:
combined_text = combined_text.replace("Sign up for Guardian Australia’s free morning and afternoon email newsletters for your daily news roundup", "")
combined_text = combined_text.replace("Find more age of extinction coverage here, and follow biodiversity reporters Phoebe Weston and Patrick Greenfield on X for all the latest news and features", "")

In [None]:
# Strip curly double quotation marks from the corpus.
quotation_marks = '“”'
quote_removal_table = str.maketrans('', '', quotation_marks)
combined_text = combined_text.translate(quote_removal_table)

In [None]:
# Some sentences got glued together during parsing ("...end.Next..."); put a
# space after every full stop that sits directly between a lower-case and an
# upper-case letter.
# A single re.sub pass gives the same result as the earlier
# findall + str.replace loop, without rescanning the whole corpus once per match.
combined_text_2 = re.sub(r"([a-z])\.([A-Z])", r"\1. \2", combined_text)

In [None]:
# Split the corpus into individual sentences.
text_sentences = sent_tokenize(combined_text_2)

In [None]:
def preprocess_sentwise(input_sent):
    """
    Preprocess one sentence of a text that was already split into sentences.

    The sentence is lower-cased and run through the global spaCy pipeline
    ``nlp``; each token is replaced by its lemma, and lemmas that are not
    purely alphabetic or that appear in the global ``filter`` collection
    (stop words and punctuation) are dropped.

    Parameters
    ----------
    input_sent : str
        The original sentence.

    Returns
    -------
    str
        The remaining lemmas joined by single spaces ('' if nothing is left).
    """
    doc = nlp(input_sent.lower())

    kept_lemmas = []
    for token in doc:
        # A lemma is not guaranteed to be lower-case even for lower-case input
        # (e.g. the pronoun lemma "I"), so normalise it before comparing with
        # the stop-word list; otherwise such stop words slip through.
        lemma = token.lemma_.lower()
        if lemma.isalpha() and lemma not in filter:
            kept_lemmas.append(lemma)

    return ' '.join(kept_lemmas)

In [None]:
# Pair every original sentence with its preprocessed (lemmatised, filtered)
# version; tqdm shows the progress.
list_orig_and_lemma = [
    (sentence, preprocess_sentwise(sentence))
    for sentence in tqdm(text_sentences)
]

In [None]:
# Table with one row per sentence: the original text and its preprocessed form.
df_orig_and_lemma = pd.DataFrame(list_orig_and_lemma, columns=['original sentence', 'preprocessed sentence'])
df_orig_and_lemma

In [None]:
# Tokenise the original sentences into words.
words = [word_tokenize(sent) for sent in df_orig_and_lemma['original sentence'].values]
words_list = list(chain.from_iterable(words))

# Drop tokens made up only of punctuation characters.
# The earlier `word not in punctuation` test was a substring check against the
# punctuation string, so punctuation-only tokens that are not a contiguous
# piece of it (e.g. "``", "''", "--") were counted as words.
punctuation_chars = set(punctuation)
words_list_no_punkt = [
    word for word in words_list
    if not all(char in punctuation_chars for char in word)
]

# Corpus size: number of words before stop-word removal, punctuation excluded.
len(words_list_no_punkt)

In [None]:
# Tokenise each preprocessed sentence, then flatten into a single list:
# the corpus tokens after preprocessing.
tokens = [
    word_tokenize(clean_sentence)
    for clean_sentence in df_orig_and_lemma['preprocessed sentence'].values
]
tokens_list = list(chain.from_iterable(tokens))

In [None]:
# Count how often each preprocessed token occurs and print the 100 most frequent.
word_freqs = Counter(tokens_list) 
print(word_freqs.most_common(100))

In [None]:
# Bar chart of the 50 most frequent unigrams.
top_unigrams = word_freqs.most_common(50)
labels_1 = [unigram for unigram, count in top_unigrams]
counts_1 = [count for unigram, count in top_unigrams]

fig, ax = plt.subplots(figsize=(15, 5))
plot = sns.barplot(x=labels_1, y=counts_1, ax=ax)
ax.tick_params(labelrotation=90)
ax.set_title("Самые частотные униграммы в корпусе")
ax.set_ylabel("Частота")
# The x axis shows unigrams; the label previously said "Биграммы" (bigrams),
# copied over from the bigram chart.
ax.set_xlabel("Униграммы")

plt.show()

In [None]:
# Count adjacent token pairs over the flattened token list and show the 30 most frequent.
# NOTE(review): tokens_list is not split by sentence, so pairs spanning a
# sentence boundary are counted too — confirm this is intended.
freq_bigramms = Counter(nltk.bigrams(tokens_list))
freq_bigramms.most_common(30)

In [None]:
# Bar chart of the 30 most frequent bigrams.
top_bigrams = freq_bigramms.most_common(30)
labels_2 = [' '.join(pair) for pair, count in top_bigrams]
counts_2 = [count for pair, count in top_bigrams]

fig, ax = plt.subplots(figsize=(15, 5))
plot = sns.barplot(x=labels_2, y=counts_2, ax=ax)
ax.tick_params(labelrotation=90)
ax.set_xlabel("Биграммы")
ax.set_ylabel("Частота")
ax.set_title("Самые частотные биграммы в корпусе")

plt.show()

# Анализ тональности

In [None]:
# Candidate keywords for the sentiment analysis: the 100 most frequent
# unigrams followed by the 30 most frequent bigrams, each as (item, count).
vocab_uni = list(word_freqs.most_common(100))
vocab_bi = list(freq_bigramms.most_common(30))

concatenated_vocab = vocab_uni + vocab_bi

In [None]:
# Load the eng_spacysentiment pipeline; the cells below read the
# 'positive' and 'negative' scores from its doc.cats.
nlp_sentiment = eng_spacysentiment.load()

In [None]:
# For every candidate keyword, count in how many sentences containing it the
# sentiment model scores 'positive' above 'negative' and vice versa.
sentiment_vocab = []

# preprocessed sentence -> category scores; a sentence that contains several
# keywords is scored by the model only once.
sentiment_cache = {}

for word in concatenated_vocab:
    # `word` is (unigram, count) or ((first, second), count).
    # Test the type rather than `len(word[0]) == 2`, which also matched
    # two-letter unigrams and turned e.g. "uk" into "u k".
    if isinstance(word[0], tuple):
        keyword = ' '.join(word[0])
    else:
        keyword = word[0]

    # Match the keyword as whole word(s). A plain substring search would also
    # select sentences where it is only part of a longer word ("ice" in "price").
    pattern = rf'\b{re.escape(keyword)}\b'
    sents = df_orig_and_lemma[df_orig_and_lemma['preprocessed sentence'].str.contains(pattern, regex=True)].values.tolist()

    count_pos = 0
    count_neg = 0

    for org_sent, prep_sent in sents:
        if prep_sent not in sentiment_cache:
            sentiment_cache[prep_sent] = nlp_sentiment(prep_sent).cats
        sentiment = sentiment_cache[prep_sent]

        # Sentences with equal scores count for neither side.
        if sentiment['positive'] > sentiment['negative']:
            count_pos += 1
        elif sentiment['negative'] > sentiment['positive']:
            count_neg += 1
    sentiment_vocab.append((keyword, word[1], count_pos, count_neg))

vocab = pd.DataFrame(sentiment_vocab, columns=['keyword', 'count', 'count_pos', 'count_neg'])
vocab

#vocab.to_excel("vocab_trial.xlsx")

In [None]:
# Collect the sentences containing one chosen keyword, split by the sentiment
# the model assigns, so the automatic labels can be checked by hand.
keyword = "pollution"

# Whole-word match: a plain substring search would also pick up sentences
# where the keyword is only part of a longer word.
pattern = rf'\b{re.escape(keyword)}\b'
sents = df_orig_and_lemma[df_orig_and_lemma['preprocessed sentence'].str.contains(pattern, regex=True)].values.tolist()
list_pos = []
list_neg = []

for org_sent, prep_sent in sents:
    doc = nlp_sentiment(prep_sent)
    sentiment = doc.cats

    # Keep the original sentence with the winning score; ties go to neither list.
    if sentiment['positive'] > sentiment['negative']:
        list_pos.append((org_sent, sentiment['positive']))
    elif sentiment['negative'] > sentiment['positive']:
        list_neg.append((org_sent, sentiment['negative']))

In [None]:
# Вывод предложений, содержащих ключевое слово, для проверки результатов анализа тональности вручную, чтобы исправить данные в таблице Еxcel 

In [None]:
# Sentences with the keyword that scored more positive than negative, with their positive score.
list_pos

In [None]:
# Sentences with the keyword that scored more negative than positive, with their negative score.
list_neg

In [None]:
# Последующая работа велась в таблице Еxcel: 
# Там я считала PMI, T-score, хи-квадрат по формулам для отобранных ключевых слов
# В ходе работы некоторые изначально отобранные частотные униграммы и биграммы были удалены, и добавлены другие менее частотные, но более релевантные