I - Packages Import


In [None]:
import ast
import os
import pandas as pd

from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, pipeline
import torch

import matplotlib.pyplot as plt
import plotly.express as px


II - Data Verification / Preprocessing

This code checks the data format and plots the monthly distribution of the articles' publication dates.

In [None]:
# Read the Guardian environment-news CSV into the working DataFrame
df = pd.read_csv('guardian_environment_news.csv')

# Quick sanity check: show the first five rows
print(df.head(n=5))


In [None]:
# Parse 'Date Published' as datetimes; values that cannot be parsed become NaT
df['Date Published'] = pd.to_datetime(df['Date Published'], errors='coerce')

# Warn when at least one date failed to parse
if df['Date Published'].isna().any():
    print("Warning: Some dates could not be parsed and have been set to NaT.")

# Keep only the articles published in 2023. NaT rows fail both comparisons,
# so they are dropped here as well.
# .copy() makes the result an independent frame: the 'Year-Month' column added
# below is then written to a real DataFrame rather than to a slice of the
# unfiltered one (which triggers pandas' SettingWithCopyWarning).
in_2023 = (df['Date Published'] >= '2023-01-01') & (df['Date Published'] < '2024-01-01')
df = df[in_2023].copy()

# Count the articles per calendar month, in chronological order
df['Year-Month'] = df['Date Published'].dt.to_period('M')
monthly_counts = df['Year-Month'].value_counts().sort_index()

# Bar chart of the number of articles per month
plt.figure(figsize=(12, 6))
plt.bar(monthly_counts.index.astype(str), monthly_counts.values, color='skyblue')
plt.title("Monthly Article Publication Distribution", fontsize=16)
plt.xlabel("Month", fontsize=14)
plt.ylabel("Number of Articles", fontsize=14)
plt.xticks(rotation=45)
plt.grid(alpha=0.5, axis='y')
plt.tight_layout()
plt.show()


III - Features Extraction

This code uses several NLP pipelines to extract the relevant features from the article text.

In [None]:
# Named-entity recognition pipeline backed by a BERT-large (cased) checkpoint
# fine-tuned on CoNLL-2003 English. The same checkpoint id supplies both the
# model and the tokenizer.
ner_checkpoint = 'dbmdz/bert-large-cased-finetuned-conll03-english'
ner_pipeline = pipeline('ner', model=ner_checkpoint, tokenizer=ner_checkpoint)

def get_named_entities(text):
    """Run the module-level NER pipeline on ``text`` and return its output unchanged."""
    return ner_pipeline(text)

# Row-level worker: print the row label for progress tracking, then run NER
# unless the article text is missing (in which case the result is None).
def _ner_for_row(row):
    print(f"Processing index: {row.name}")
    article = row['Article Text']
    if pd.notna(article):
        return get_named_entities(article)
    return None

# One NER result (or None) per article, stored in the 'NER Location' column
df['NER Location'] = df.apply(_ner_for_row, axis=1)

# Checkpoint the frame, including the NER results, to disk
df.to_csv('sample_NER.csv', index=False)


In [None]:
# Sentiment-analysis pipeline backed by a DistilBERT checkpoint fine-tuned on
# SST-2 English. truncation=True cuts inputs down to the model's input size.
sentiment_checkpoint = 'distilbert-base-uncased-finetuned-sst-2-english'
sent_classifier = pipeline('sentiment-analysis', model=sentiment_checkpoint, truncation=True)

def get_sentiment_score(text):
    """Return a polarity score in [0, 1] for ``text`` (higher = more positive).

    The classifier reports only its winning label together with that label's
    confidence. Returning the raw confidence would give a strongly negative
    text and a strongly positive text the same value (close to 1), which makes
    averages of the score meaningless as "sentiment". The confidence is
    therefore converted into the probability of the POSITIVE class:
    unchanged when the label is POSITIVE, ``1 - score`` when it is NEGATIVE.

    NOTE(review): assumes the checkpoint behind ``sent_classifier`` is a
    two-class model whose labels are 'POSITIVE' / 'NEGATIVE' - confirm against
    the model card if the checkpoint is ever swapped.

    Args:
        text: The text to classify.

    Returns:
        float: Estimated probability that ``text`` is positive.
    """
    top_prediction = sent_classifier(text)[0]  # single input -> one-element list
    confidence = top_prediction['score']
    if top_prediction['label'] == 'NEGATIVE':
        # Two classes only, so P(positive) is the complement of P(negative)
        return 1.0 - confidence
    return confidence

# Row-level worker: print the row label for progress tracking, then score the
# article unless its text is missing (in which case the result is None).
def _sentiment_for_row(row):
    print(f"Processing index: {row.name}")
    article = row['Article Text']
    if pd.notna(article):
        return get_sentiment_score(article)
    return None

# One score (or None) per article, stored in the 'Sentiment Score' column
df['Sentiment Score'] = df.apply(_sentiment_for_row, axis=1)

# Checkpoint the frame, now including the sentiment scores, to 'df_NER+sent.csv'
df.to_csv('df_NER+sent.csv', index=False)


In [None]:
# Emotion-classification pipeline backed by a DistilRoBERTa checkpoint trained
# for emotion detection. The same checkpoint id supplies both the model and the
# tokenizer; truncation=True cuts inputs down to the model's input size.
emotion_checkpoint = 'j-hartmann/emotion-english-distilroberta-base'
emo_classifier = pipeline(
    'text-classification',
    model=emotion_checkpoint,
    tokenizer=emotion_checkpoint,
    truncation=True,
)

def get_emotion_score(article_text):
    """Classify ``article_text`` with the module-level emotion pipeline and return its output as-is."""
    return emo_classifier(article_text)

# Row-level worker: print the row label for progress tracking, then classify
# the article unless its text is missing (in which case the result is None).
def _emotion_for_row(row):
    print(f"Processing index: {row.name}")
    article = row['Article Text']
    if pd.notna(article):
        return get_emotion_score(article)
    return None

# One classifier output (or None) per article, stored in the 'Emotion' column
df['Emotion'] = df.apply(_emotion_for_row, axis=1)

def extract_highest_emotion(emotion_data):
    """Return the label of the highest-scoring emotion in ``emotion_data``.

    ``emotion_data`` may be either the list of ``{'label', 'score'}`` dicts
    produced in memory, or the string form of such a list as read back from a
    CSV file. Only strings are passed through ``ast.literal_eval``: calling it
    on an actual list raises ValueError, which previously sent every in-memory
    result to the 'neutral' fallback.

    Args:
        emotion_data: List of dicts with 'label' and 'score' keys, or the
            string representation of such a list.

    Returns:
        str: The winning label, or 'neutral' when the data is missing (None),
        empty, or cannot be parsed.
    """
    try:
        if isinstance(emotion_data, str):
            emotions = ast.literal_eval(emotion_data)
        else:
            emotions = emotion_data

        # max() raises ValueError on an empty list and TypeError on None/NaN,
        # both of which fall through to the default below.
        return max(emotions, key=lambda entry: entry['score'])['label']
    except (ValueError, SyntaxError, TypeError):
        return 'neutral'

# Reduce each article's emotion output to a single label
df['Emotion_Label'] = df['Emotion'].map(extract_highest_emotion)

# Checkpoint the frame, now including the emotion columns, to disk
df.to_csv('df_NER+sent+emo.csv', index=False)


In [None]:
# Country names (plus a few aliases such as 'UK', 'USA', 'England') used to
# filter the NER results. Each name appears exactly once.
# NOTE(review): entries containing parentheses, commas or several words
# (e.g. 'Congo (Congo-Brazzaville)', 'Korea, North', 'New Zealand') can only
# match if the NER output yields that exact string as a single 'word' -
# presumably it does not for per-token output; verify and normalise if needed.
country_list = [
    'Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola', 'Antigua and Barbuda', 'Argentina', 'Armenia',
    'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium',
    'Belize', 'Benin', 'Bhutan', 'Bolivia', 'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'Brunei', 'Bulgaria',
    'Burkina Faso', 'Burundi', 'Cabo Verde', 'Cambodia', 'Cameroon', 'Canada', 'Central African Republic', 'Chad',
    'Chile', 'China', 'Colombia', 'Comoros', 'Congo (Congo-Brazzaville)', 'Congo (Congo-Kinshasa)', 'Costa Rica',
    'Croatia', 'Cuba', 'Cyprus', 'Czechia (Czech Republic)', 'Denmark', 'Djibouti', 'Dominica', 'Dominican Republic',
    'Ecuador', 'Egypt', 'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Estonia', 'Eswatini (fmr. "Swaziland")', 'England',
    'Ethiopia', 'Fiji', 'Finland', 'France', 'Gabon', 'Gambia', 'Georgia', 'Germany', 'Ghana', 'Greece', 'Grenada', 'Great Britain',
    'Guatemala', 'Guinea', 'Guinea-Bissau', 'Guyana', 'Haiti', 'Honduras', 'Hungary', 'Iceland', 'India', 'Indonesia',
    'Iran', 'Iraq', 'Ireland', 'Israel', 'Italy', 'Jamaica', 'Japan', 'Jordan', 'Kazakhstan', 'Kenya', 'Kiribati',
    'Korea, North', 'Korea, South', 'Kuwait', 'Kyrgyzstan', 'Laos', 'Latvia', 'Lebanon', 'Lesotho', 'Liberia', 'Libya',
    'Liechtenstein', 'Lithuania', 'Luxembourg', 'Madagascar', 'Malawi', 'Malaysia', 'Maldives', 'Mali', 'Malta', 'Marshall Islands',
    'Mauritania', 'Mauritius', 'Mexico', 'Micronesia', 'Moldova', 'Monaco', 'Mongolia', 'Montenegro', 'Morocco', 'Mozambique',
    'Myanmar (formerly Burma)', 'Namibia', 'Nauru', 'Nepal', 'Netherlands', 'New Zealand', 'Nicaragua', 'Niger', 'Nigeria',
    'North Macedonia', 'Norway', 'Oman', 'Pakistan', 'Palau', 'Panama', 'Papua New Guinea', 'Paraguay', 'Peru', 'Philippines',
    'Poland', 'Portugal', 'Qatar', 'Romania', 'Russia', 'Rwanda', 'Saint Kitts and Nevis', 'Saint Lucia', 'Saint Vincent and the Grenadines',
    'Samoa', 'San Marino', 'Sao Tome and Principe', 'Saudi Arabia', 'Senegal', 'Serbia', 'Seychelles', 'Sierra Leone', 'Singapore',
    'Slovakia', 'Slovenia', 'Solomon Islands', 'Somalia', 'South Africa', 'South Sudan', 'Spain', 'Sri Lanka', 'Sudan', 'Suriname',
    'Sweden', 'Switzerland', 'Syria', 'Taiwan', 'Tajikistan', 'Tanzania', 'Thailand', 'Timor-Leste', 'Togo', 'Tonga', 'Trinidad and Tobago',
    'Tunisia', 'Turkey', 'Turkmenistan', 'Tuvalu', 'Uganda', 'Ukraine', 'United Arab Emirates', 'United Kingdom', 'United-Kingdom', 'UK', 'United States of America', 'United States', 'USA',
    'Uruguay', 'Uzbekistan', 'Vanuatu', 'Vatican City', 'Venezuela', 'Vietnam', 'Yemen', 'Zambia', 'Zimbabwe'
]

def extract_countries(ner_data, country_list):
    """Return the unique country names found among the location entities.

    An entity is kept when its 'entity' tag is 'I-LOC' and its 'word' is an
    exact member of ``country_list``.

    Args:
        ner_data: List of NER result dicts (each with 'entity' and 'word'
            keys), the string form of such a list (as read back from a CSV
            file), or a missing value (None / NaN) for articles without text.
        country_list: Iterable of accepted country names.

    Returns:
        list[str]: Matching country names without duplicates, sorted
        alphabetically so the output is deterministic. Empty when ``ner_data``
        is missing or cannot be parsed.
    """
    if isinstance(ner_data, str):
        # CSV round-trips turn the list into its string representation
        try:
            ner_data = ast.literal_eval(ner_data)
        except (ValueError, SyntaxError):
            return []

    # Rows without article text carry None (or NaN) instead of a list
    if not isinstance(ner_data, (list, tuple)):
        return []

    known_countries = set(country_list)  # O(1) membership tests
    found = {
        item['word']
        for item in ner_data
        if item['entity'] == 'I-LOC' and item['word'] in known_countries
    }
    return sorted(found)

# Derive the 'countries' column: for every article, the countries from
# country_list that appear among its NER results.
df['countries'] = df['NER Location'].apply(extract_countries, args=(country_list,))

# Checkpoint the frame, now including the extracted countries, to disk
df.to_csv('df_NER+sent+emo+countries.csv', index=False)

In [None]:
# Write the frame with all columns present at this point to 'final_df.csv', without the index
df.to_csv('final_df.csv', index=False)

IV - Features Post-Processing

 This code splits the dataframe into monthly intervals plus one full-year interval, then builds for each interval a result dataframe containing the data needed to plot the maps.
   

In [None]:
import pandas as pd
#df = pd.read_csv('final_df.csv')

# Split the articles by publication date.
# Consecutive entries of `month_edges` delimit one half-open interval
# [start, end) per calendar month of 2023.
month_edges = [
    '2023-01-01', '2023-02-01', '2023-03-01', '2023-04-01', '2023-05-01',
    '2023-06-01', '2023-07-01', '2023-08-01', '2023-09-01', '2023-10-01',
    '2023-11-01', '2023-12-01', '2024-01-01',
]
published = df['Date Published']

monthly_frames = [
    df[(published >= start) & (published < end)]
    for start, end in zip(month_edges[:-1], month_edges[1:])
]

# Keep the individual per-month names available (df1 = January ... df12 = December)
df1, df2, df3, df4, df5, df6, df7, df8, df9, df10, df11, df12 = monthly_frames

# df13 spans the whole year
df13 = df[(published >= month_edges[0]) & (published < month_edges[-1])]
print(df13.shape)

# Twelve monthly frames followed by the full-year frame
dfs = [*monthly_frames, df13]


In [None]:
import ast
import pandas as pd

def _parse_list_cell(value, unique=False):
    """Turn a CSV-serialised list back into a list; pass any other value through unchanged."""
    if isinstance(value, str):
        value = ast.literal_eval(value)
        if unique:
            # Order-preserving de-duplication
            value = list(dict.fromkeys(value))
    return value


def final_processing(dfs, topics_to_keep=None):
    """Aggregate article features per country and save one CSV per input frame.

    For the frame at position ``idx`` the file 'final_result_{idx}.csv' is
    written with one row per country and these columns:

    * 'countries'        - country name
    * 'Sentiment Score'  - mean sentiment score of the articles mentioning it
    * one column per topic in ``topics_to_keep`` - number of mentioning
      articles tagged with that topic
    * 'Dominant_Emotion' - most frequent 'Emotion_Label' among those articles
    * 'Country_Count'    - number of articles mentioning the country

    The sentiment mean and the emotion counts are computed with one row per
    (article, country) pair. Computing them after the topics have been
    exploded would count an article once per topic it carries and bias both
    statistics towards multi-topic articles.

    Args:
        dfs: Iterable of DataFrames, each with the columns 'countries',
            'Sentiment Score', 'Emotion_Label' and 'topics'. List-valued cells
            may be real lists or their string representation.
            NOTE(review): nothing in the visible part of this notebook creates
            the 'topics' column - confirm where it comes from.
        topics_to_keep: Non-empty sequence of topic names to count. Defaults
            to water, forest, energy, pollution and biodiversity.

    Raises:
        KeyError: If a frame lacks one of the required columns.
    """
    if topics_to_keep is None:
        topics_to_keep = ['water', 'forest', 'energy', 'pollution', 'biodiversity']
    else:
        topics_to_keep = list(topics_to_keep)

    output_columns = ['countries', 'Sentiment Score', *topics_to_keep,
                      'Dominant_Emotion', 'Country_Count']

    for idx, period_df in enumerate(dfs):
        # .copy() so the assignments below never write into a slice of the caller's frame
        articles = period_df[['countries', 'Sentiment Score', 'Emotion_Label', 'topics']].copy()

        # Step 1: restore list cells that were serialised to strings
        articles['countries'] = articles['countries'].apply(lambda cell: _parse_list_cell(cell, unique=True))
        articles['topics'] = articles['topics'].apply(_parse_list_cell)

        # Step 2: one row per (article, country) pair
        by_country = articles.explode('countries').reset_index(drop=True)

        # A period without any country mention still gets a (header-only) file
        # so that later steps can read it.
        if by_country['countries'].notna().sum() == 0:
            pd.DataFrame(columns=output_columns).to_csv(f'final_result_{idx}.csv', index=False)
            continue

        # Step 3: mean sentiment per country
        per_country = by_country.groupby('countries', as_index=False)['Sentiment Score'].mean()

        # Step 4: topic counts per country, from one row per (article, country, topic)
        by_topic = by_country[['countries', 'topics']].explode('topics')
        for topic in topics_to_keep:
            by_topic[topic] = (by_topic['topics'] == topic).astype(int)
        topic_counts = by_topic.groupby('countries', as_index=False)[topics_to_keep].sum()

        # Step 5: dominant emotion per country
        emotion_counts = by_country.groupby(['countries', 'Emotion_Label']).size().unstack(fill_value=0)
        dominant_emotion = emotion_counts.idxmax(axis=1).rename('Dominant_Emotion').reset_index()

        # Step 6: number of articles mentioning each country
        country_count = by_country.groupby('countries').size().reset_index(name='Country_Count')

        # Step 7: combine everything on the country name (not on row position)
        final_result = per_country.merge(topic_counts, on='countries')
        final_result = final_result.merge(dominant_emotion, on='countries')
        final_result = final_result.merge(country_count, on='countries', how='left')

        # Save result dataframe as csv
        final_result.to_csv(f'final_result_{idx}.csv', index=False)

# Build and save the per-country summary for every frame in `dfs`;
# the frame at position idx is written to 'final_result_{idx}.csv'.
final_processing(dfs)


V - Map Plotting

This code uses the result dataframes to plot a map for each feature and each time period.

In [None]:
!pip install -U kaleido

In [None]:

# Directory that receives every rendered map image
output_folder = "last_maps"

# Create the folder if it is missing. exist_ok=True makes this a no-op when the
# directory already exists, without a separate check-then-create step.
os.makedirs(output_folder, exist_ok=True)

def plot(idx):
    """Render one choropleth map per feature for the result file of period ``idx``.

    Reads 'final_result_{idx}.csv' and writes one PNG per feature into
    ``output_folder``, named '<prefix>_{idx}.png'.

    Args:
        idx: Position of the time period; selects the input CSV and is used as
            the suffix of every output file name.
    """
    # Import specific time period result dataframe
    results = pd.read_csv(f'final_result_{idx}.csv')

    # (column used for the colour, file-name prefix) for every numeric feature.
    # 'pollution' is included because the result files carry a count for it
    # alongside the other topics.
    numeric_maps = [
        ("water", "water"),
        ("Sentiment Score", "sent"),
        ("forest", "forest"),
        ("energy", "energy"),
        ("pollution", "pollution"),
        ("biodiversity", "biodiversity"),
        ("Country_Count", "count"),
    ]
    for column, prefix in numeric_maps:
        fig = px.choropleth(
            results,  # Data source for the map
            locations="countries",  # Column with the country names
            locationmode='country names',  # Match locations by country name
            color=column,  # Feature that drives the colour intensity
            color_continuous_scale=px.colors.sequential.Plasma,
        )
        fig.write_image(os.path.join(output_folder, f"{prefix}_{idx}.png"))

    # 'Dominant_Emotion' holds labels rather than numbers, so no continuous
    # colour scale is passed for it.
    # NOTE(review): presumably plotly assigns discrete colours to string
    # columns - confirm against the px.choropleth documentation.
    fig = px.choropleth(
        results,
        locations="countries",
        locationmode='country names',
        color="Dominant_Emotion",
    )
    fig.write_image(os.path.join(output_folder, f"emo_{idx}.png"))

# Render the maps for result files 0 to 12: the twelve monthly periods followed
# by the full-year period.
for period_idx in range(13):
    plot(period_idx)