In [12]:
# (Re)load IPython's autoreload extension and switch it to mode 2 so that
# edits to imported project modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [13]:
import os, sys
import pandas as pd

from matplotlib import pyplot as plt
from nltk.corpus import stopwords
from wordcloud import WordCloud



In [14]:
# Put the notebook's parent directory at the front of the module search path
# (once only) so that the `src` package can be imported from this notebook.
rpath = os.path.abspath(os.pardir)
already_on_path = rpath in sys.path
if not already_on_path:
    sys.path.insert(0, rpath)


In [15]:
# Project-local imports stay in this cell (rather than the top import cell)
# because `src` only resolves once the parent directory is on sys.path.
from src.loader import NewsDataLoader
import src.utils as utils

# NOTE(review): the saved output of this cell is
# "ModuleNotFoundError: No module named 'loader'" — presumably a module inside
# `src` imports `loader` without the `src.` prefix; confirm and fix it there.

# No `if __name__ == "__main__":` guard here: in a notebook it is always true,
# and if this code is ever exported and imported as a module the guard would
# leave `merge_df` undefined for every analysis cell that follows.
data_directory = "../data"
loader = NewsDataLoader(data_directory)

# Frame returned by the loader; all analysis cells below read from it.
merge_df = loader.load_data()

ModuleNotFoundError: No module named 'loader'

In [None]:
# Top and bottom 10 websites by number of news articles.
# `value_counts()` sorts in descending order, so the head holds the most
# prolific sites and the tail the least prolific ones.
articles_per_website = merge_df['source_name'].value_counts()
top_10_websites = articles_per_website.head(10)
bottom_10_websites = articles_per_website.tail(10)

print("Top 10 Websites with Largest Count of News Articles:\n", top_10_websites)
print("\nBottom 10 Websites with Smallest Count of News Articles:\n", bottom_10_websites)

In [None]:

# Side-by-side bar charts: most prolific sites on the left, least prolific on
# the right. Each panel is drawn on its own explicit Axes.
fig, (top_ax, bottom_ax) = plt.subplots(1, 2, figsize=(10, 6))

panels = (
    (top_ax, top_10_websites, 'skyblue',
     'Top 10 Websites with Largest Count of News Articles'),
    (bottom_ax, bottom_10_websites, 'salmon',
     'Bottom 10 Websites with Smallest Count of News Articles'),
)

for panel_ax, article_counts, bar_colour, heading in panels:
    article_counts.plot(kind='bar', color=bar_colour, ax=panel_ax)
    panel_ax.set_title(heading)
    panel_ax.set_xlabel('Website')
    panel_ax.set_ylabel('Number of Articles')
    # Slant the site names so long labels do not overlap.
    plt.setp(panel_ax.get_xticklabels(), rotation=45, ha='right')

fig.tight_layout()
plt.show()

In [None]:
# Websites with the Highest Numbers of Visitors Traffic
#
# NOTE(review): assumes `GlobalRank` is a popularity rank in which 1 is the
# most visited site, i.e. a SMALLER number means MORE traffic — confirm against
# the traffic dataset. Under that reading the previous `.max().nlargest(10)`
# selected the ten least visited sites, so the best (lowest) rank per site is
# taken and the ten smallest ranks are kept instead.
# The leftover debug dump of the whole `GlobalRank` column has been dropped.
websites_traffic = merge_df.groupby('source_name')['GlobalRank'].min().nsmallest(10)
print("\nWebsites with Highest Numbers of Visitors Traffic:\n", websites_traffic)

In [None]:
# Countries with the Highest Number of News Media Organisations
#
# Count DISTINCT organisations (`source_name`) per country. A plain
# `value_counts()` on `Country` counts rows instead, which over-weights
# countries whose outlets simply appear on more rows.
# NOTE(review): assumes each row of `merge_df` is one article and `Country` is
# the country of the publishing organisation — confirm against the loader.
countries_media_organizations = (
    merge_df.groupby('Country')['source_name']
    .nunique()
    .sort_values(ascending=False)
)
print("\nCountries with the Highest Number of News Media Organisations:")
print(countries_media_organizations)


In [None]:
# Countries that have many articles written about them.
# Tallies how many rows carry each value of the `Country` column.
# NOTE(review): if `Country` is the publisher's country rather than the
# article's subject, this counts articles written *in* a country, not *about*
# it — confirm what the column holds.
country_column = merge_df['Country']
articles_about_countries = country_column.value_counts()
print("\nCountries that have many articles written about them:", articles_about_countries)


In [None]:

# Websites that reported about specific countries or regions.
# Keeps every row whose `Country` value is one of the labels of interest.
# NOTE(review): the list mixes regions and abbreviations ('Africa', 'EU',
# 'US', 'Middle East'); verify that `Country` actually contains these exact
# labels, otherwise those entries never match.
countries_of_interest = ['Africa', 'US', 'China', 'EU', 'Russia', 'Ukraine', 'Middle East']
country_matches = merge_df['Country'].isin(countries_of_interest)
websites_reporting_about_countries = merge_df.loc[country_matches]

print("\nWebsites that reported about specific countries or regions:", websites_reporting_about_countries)


In [None]:

# Websites with the highest count of positive, neutral, and negative sentiment.
# For every website, count how often each `title_sentiment` value occurs.
title_sentiment_by_website = merge_df.groupby('source_name')['title_sentiment']
sentiment_counts_by_website = title_sentiment_by_website.value_counts()
print("\nWebsites with the highest count of positive, neutral, and negative sentiment:", sentiment_counts_by_website)


In [None]:
# Compare the impact of using mean/average and median
#
# The sentiment column is converted to numbers in a SEPARATE series instead of
# being written back into `merge_df`: coercing in place turns every
# non-numeric label into NaN, which wipes the column for all later cells
# (their sentiment counts then come out empty).
# NOTE(review): the label spelling below (positive / neutral / negative,
# case-insensitive) is an assumption — confirm against the data.
SENTIMENT_SCORES = {'positive': 1.0, 'neutral': 0.0, 'negative': -1.0}

title_sentiment_raw = merge_df['title_sentiment']
# Scores for textual labels; anything unrecognised stays NaN.
title_sentiment_from_labels = (
    title_sentiment_raw.astype(str).str.strip().str.lower().map(SENTIMENT_SCORES)
)
# Values that are already numeric are kept; labels fill the remaining gaps.
title_sentiment_score = pd.to_numeric(title_sentiment_raw, errors='coerce').fillna(
    title_sentiment_from_labels
)

# Same shape as before: one 'title_sentiment' column, rows 'mean' and 'median'.
impact_mean_average_median = title_sentiment_score.to_frame('title_sentiment').agg(['mean', 'median'])

print("\nImpact of using mean/average and median sentiment:", impact_mean_average_median)


In [None]:

# Check the distribution of sentiments for the ten highest-traffic domains
# (the counts below are pooled over those ten domains, not a single one).
#
# NOTE(review): assumes `GlobalRank` is a popularity rank where 1 is the most
# visited site, so the highest-traffic domains are the ones with the SMALLEST
# rank — confirm against the traffic dataset. The previous
# `.max().nlargest(10)` picked the ten lowest-traffic domains under that reading.
top_10_domains_by_traffic = merge_df.groupby('source_name')['GlobalRank'].min().nsmallest(10)
from_top_domain = merge_df['source_name'].isin(top_10_domains_by_traffic.index)
sentiment_distribution_for_domain = merge_df.loc[from_top_domain, 'title_sentiment'].value_counts()

print("Sentiment distribution for a particular domain:")
print(sentiment_distribution_for_domain)

In [None]:
# Bar chart of the sentiment counts; bail out with a message when there is
# nothing to draw.
if sentiment_distribution_for_domain.empty:
    print("Sentiment distribution data is empty. Unable to plot.")
else:
    fig, ax = plt.subplots(figsize=(8, 6))
    sentiment_distribution_for_domain.plot(kind='bar', color='lightgreen', ax=ax)
    ax.set_title('Distribution of Sentiments for a Particular Domain')
    ax.set_xlabel('Sentiment')
    ax.set_ylabel('Count')
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    fig.tight_layout()
    plt.show()

In [None]:
# Compare content metadata across sites: number of distinct authors and
# distinct categories seen for each website.
metadata_columns = ['author', 'category']
articles_by_website = merge_df.groupby('source_name')
content_metadata_comparison = articles_by_website[metadata_columns].nunique()
print("\nComparison of content metadata across sites:", content_metadata_comparison)


In [None]:
# Grouped bar chart of unique authors / categories per website.
#
# `DataFrame.plot` opens its own figure unless it is handed an Axes, so a bare
# `plt.figure(figsize=...)` beforehand only leaves an empty figure behind and
# the requested size is ignored. Create the Axes explicitly and pass it in.
fig, ax = plt.subplots(figsize=(10, 6))
content_metadata_comparison.plot(kind='bar', ax=ax)
ax.set_title('Comparison of Content Metadata Across Sites')
ax.set_xlabel('Website')
ax.set_ylabel('Count')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
# Friendlier legend names, in the frame's column order (author, category).
ax.legend(['Unique Authors', 'Unique Categories'])
fig.tight_layout()
plt.show()

In [None]:
# Analyze the similarity of message lengths across sites:
# average character length of `content` for each website.
content_char_counts = merge_df['content'].str.len()
message_lengths = content_char_counts.groupby(merge_df['source_name']).mean()
print("\nAnalysis of the similarity of message lengths across sites:", message_lengths)

In [None]:
# Bar chart of the average content length per website.
fig, ax = plt.subplots(figsize=(10, 6))
message_lengths.plot(kind='bar', color='skyblue', ax=ax)
ax.set_title('Analysis of Message Lengths Across Sites')
ax.set_xlabel('Website')
ax.set_ylabel('Average Message Length')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

In [None]:
# Analyze the similarity of the number of words in titles across sites:
# average whitespace-separated word count of `title` for each website.
words_per_title = merge_df['title'].str.split().str.len()
title_word_counts = words_per_title.groupby(merge_df['source_name']).mean()
print("\nAnalysis of the similarity of the number of words in titles across sites:", title_word_counts)


In [None]:
# Bar chart of the average title word count per website.
fig, ax = plt.subplots(figsize=(10, 6))
title_word_counts.plot(kind='bar', color='salmon', ax=ax)
ax.set_title('Analysis of Title Word Counts Across Sites')
ax.set_xlabel('Website')
ax.set_ylabel('Average Number of Words in Title')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

In [None]:
# Average number of words in titles, one bar per website.
# NOTE(review): this draws the same series as the previous title-word-count
# chart with a different colour and title — consider keeping only one.
fig, ax = plt.subplots(figsize=(10, 6))
title_word_counts.plot(kind='bar', color='lightgreen', ax=ax)
ax.set_title('Average Number of Words in Titles Across Sites')
ax.set_xlabel('Website')
ax.set_ylabel('Average Number of Words')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

fig.tight_layout()
plt.show()

In [None]:

# Compare content metadata across sites (stacked bars: authors + categories).
#
# `DataFrame.plot` opens its own figure unless it is handed an Axes, so a bare
# `plt.figure(figsize=...)` beforehand only leaves an empty figure behind and
# the requested size is ignored. Create the Axes explicitly and pass it in.
fig, ax = plt.subplots(figsize=(10, 6))
content_metadata_comparison.plot(kind='bar', stacked=True, ax=ax)
ax.set_title('Comparison of Content Metadata Across Sites')
ax.set_xlabel('Website')
ax.set_ylabel('Number of Unique Items')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
# Legend names follow the frame's column order (author, category).
ax.legend(['Authors', 'Categories'])
fig.tight_layout()
plt.show()


In [None]:
# Similarity of raw message lengths across sites
#
# One histogram is drawn PER website, each carrying its own label.
# `groupby(...).apply(lambda x: x.str.len())` returns a single Series, so the
# previous `.hist()` call drew one pooled histogram and the legend (fed every
# site name) attached just the first name to it.
# Only the sites with the most articles are overlaid; more than a handful of
# translucent histograms on one Axes cannot be told apart.
MAX_SITES_SHOWN = 10

content_lengths = merge_df['content'].str.len()
sites_to_plot = merge_df['source_name'].value_counts().head(MAX_SITES_SHOWN).index

fig, ax = plt.subplots(figsize=(10, 6))
for site in sites_to_plot:
    site_lengths = content_lengths[merge_df['source_name'] == site].dropna()
    if site_lengths.empty:
        # Nothing measurable for this site (all content missing).
        continue
    ax.hist(site_lengths, bins=20, alpha=0.5, density=True, label=site)

ax.set_title('Distribution of Message Lengths Across Sites')
ax.set_xlabel('Message Length')
ax.set_ylabel('Density')
handles, _ = ax.get_legend_handles_labels()
if handles:
    ax.legend()
fig.tight_layout()
plt.show()

In [None]:

# Similarity of number of words in titles across sites
#
# One histogram is drawn PER website, each carrying its own label.
# `groupby(...).apply(lambda x: x.str.split().str.len())` returns a single
# Series, so the previous `.hist()` call drew one pooled histogram and the
# legend (fed every site name) attached just the first name to it.
# Only the sites with the most articles are overlaid; more than a handful of
# translucent histograms on one Axes cannot be told apart.
MAX_SITES_SHOWN = 10

title_lengths_in_words = merge_df['title'].str.split().str.len()
sites_to_plot = merge_df['source_name'].value_counts().head(MAX_SITES_SHOWN).index

fig, ax = plt.subplots(figsize=(10, 6))
for site in sites_to_plot:
    site_word_counts = title_lengths_in_words[merge_df['source_name'] == site].dropna()
    if site_word_counts.empty:
        # Nothing measurable for this site (all titles missing).
        continue
    ax.hist(site_word_counts, bins=20, alpha=0.5, density=True, label=site)

ax.set_title('Distribution of Number of Words in Titles Across Sites')
ax.set_xlabel('Number of Words')
ax.set_ylabel('Density')
handles, _ = ax.get_legend_handles_labels()
if handles:
    ax.legend()
fig.tight_layout()
plt.show()

In [None]:

# Impact of frequent news reporting and sentiment on website’s global ranking
#
# `impact_news_reporting_sentiment` is not defined by any earlier cell, so this
# cell used to depend on hidden kernel state (NameError on a fresh run). It is
# now built here: one row per website with
#   article_id      -> number of rows (articles) for the site
#   GlobalRank      -> the site's best (lowest) rank
#   title_sentiment -> mean numeric title sentiment
# NOTE(review): the label-to-score mapping (positive / neutral / negative,
# case-insensitive) and the reading of `GlobalRank` as "lower is better" are
# assumptions — confirm against the data.
sentiment_label_scores = {'positive': 1.0, 'neutral': 0.0, 'negative': -1.0}
sentiment_from_labels = (
    merge_df['title_sentiment'].astype(str).str.strip().str.lower().map(sentiment_label_scores)
)
# Already-numeric values are kept; textual labels fill the remaining gaps.
sentiment_score = pd.to_numeric(merge_df['title_sentiment'], errors='coerce').fillna(
    sentiment_from_labels
)

articles_by_site = merge_df.assign(title_sentiment=sentiment_score).groupby('source_name')
impact_news_reporting_sentiment = pd.DataFrame({
    'article_id': articles_by_site.size(),
    'GlobalRank': articles_by_site['GlobalRank'].min(),
    'title_sentiment': articles_by_site['title_sentiment'].mean(),
})

fig, ax = plt.subplots(figsize=(10, 6))
points = ax.scatter(
    impact_news_reporting_sentiment['article_id'],
    impact_news_reporting_sentiment['GlobalRank'],
    c=impact_news_reporting_sentiment['title_sentiment'],
    cmap='coolwarm',
)
ax.set_title('Impact of News Reporting and Sentiment on Website Global Ranking')
ax.set_xlabel('Total Number of Reports by Website')
ax.set_ylabel('Global Ranking')
fig.colorbar(points, ax=ax, label='Sentiment (Average/Median)')
fig.tight_layout()
plt.show()