PART II

In [4]:
import  pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
from collections import Counter
from spacy.lang.en import English 

In [6]:
# Load the DataFrames
traffic_df = pd.read_csv('/Users/user/Desktop/martin_1/archive/traffic.csv')
data_df = pd.read_csv('/Users/user/Desktop/martin_1/archive/rating.csv')
domains_df = pd.read_csv('/Users/user/Desktop/martin_1/archive/domains_location.csv')
d_df = pd.read_csv('/Users/user/Desktop/martin_1/archive/data.csv')



In [None]:

# Step 1: Top and bottom 10 websites with the largest count of news articles
top_10_websites = data_df['source_name'].value_counts().nlargest(10)
bottom_10_websites = data_df['source_name'].value_counts().nsmallest(10)

# Step 2: Websites with the highest numbers of visitor traffic
traffic_df = domains_df.groupby('source_name')['traffic'].sum().sort_values(ascending=False)

# Step 3: Countries with the highest number of news media organizations
countries_news_org = domains_df['country'].value_counts()

# Step 4: Countries with many articles written about them
articles_per_country = data_df['country'].value_counts()

# Step 5: Websites that reported news about specific regions
region_websites = {
    'Africa': ['Africa'],
    'US': ['United States'],
    'China': ['China'],
    'EU': ['European Union', 'European Commission'],
    'Russia': ['Russia'],
    'Ukraine': ['Ukraine'],
    'Middle East': ['Middle East', 'Middle Eastern']
}

region_counts = {}
for region, countries in region_websites.items():
    region_counts[region] = data_df[data_df['country'].isin(countries)]['source_name'].nunique()

# Step 6: Websites with the highest count of positive, neutral, and negative sentiment
sentiment_counts = data_df.groupby('source_name')['sentiment'].value_counts().unstack(fill_value=0)
sentiment_counts['Total'] = sentiment_counts.sum(axis=1)

# Step 7: Compare the impact of using mean/average and median
sentiment_summary = data_df.groupby('source_name')['sentiment'].agg(['mean', 'median'])

# Step 8: Distribution of sentiments for top 10 domains by visitor traffic
top_10_traffic_domains = traffic_df.head(10).index
top_10_sentiment_distribution = data_df[data_df['source_name'].isin(top_10_traffic_domains)]['sentiment'].value_counts(normalize=True)

# Visualization
plt.figure(figsize=(10, 6))
sns.barplot(x=top_10_websites.values, y=top_10_websites.index)
plt.title('Top 10 Websites with Largest Count of News Articles')
plt.xlabel('Number of Articles')
plt.ylabel('Website')
plt.show()

plt.figure(figsize=(10, 6))
sns.barplot(x=traffic_df.head(10).values, y=traffic_df.head(10).index)
plt.title('Websites with Highest Visitor Traffic')
plt.xlabel('Visitor Traffic')
plt.ylabel('Website')
plt.show()

plt.figure(figsize=(10, 6))
countries_news_org.plot(kind='bar')
plt.title('Countries with Highest Number of News Media Organizations')
plt.xlabel('Country')
plt.ylabel('Number of News Media Organizations')
plt.show()

plt.figure(figsize=(10, 6))
articles_per_country.plot(kind='bar')
plt.title('Countries with Many Articles Written About Them')
plt.xlabel('Country')
plt.ylabel('Number of Articles')
plt.show()

plt.figure(figsize=(10, 6))
sns.barplot(x=list(region_counts.values()), y=list(region_counts.keys()))
plt.title('Websites Reporting News About Specific Regions')
plt.xlabel('Number of Websites')
plt.ylabel('Region')
plt.show()

sentiment_counts.plot(kind='bar', stacked=True, figsize=(12, 6))
plt.title('Sentiment Distribution by Website')
plt.xlabel('Website')
plt.ylabel('Count')
plt.legend(title='Sentiment')
plt.show()

sentiment_summary.plot(kind='bar', figsize=(12, 6))
plt.title('Comparison of Mean and Median Sentiment')
plt.xlabel('Website')
plt.ylabel('Sentiment')
plt.legend(title='Statistic')
plt.show()

top_10_sentiment_distribution.plot(kind='bar', figsize=(10, 6))
plt.title('Sentiment Distribution for Top 10 Traffic Domains')
plt.xlabel('Sentiment')
plt.ylabel('Proportion')
plt.show()

In [None]:
#Top and bottom 10 websites with the largest count of news articles
top_10_websites = data_df['source_name'].value_counts().nlargest(10)
plt.figure(figsize=(10, 6))
sns.barplot(x=top_10_websites.values, y=top_10_websites.index)
plt.title('Top 10 Websites with Largest Count of News Articles')
plt.xlabel('Number of Articles')
plt.ylabel('Website')
plt.show()

In [None]:
data_df.info()

In [None]:
# bottom 10 websites with the largest count of news articles
bottom_10_websites = data_df['source_name'].value_counts().nsmallest(10)
plt.figure(figsize=(10, 6))
plt.subplot(1, 2, 2)
sns.barplot(x=bottom_10_websites.values, y=bottom_10_websites.index,palette='Blues',legend=False)
plt.title('Top 10 Websites with Smallest Count of News Articles')
plt.xlabel('Number of Articles')
plt.ylabel('Website')
plt.show()

In [None]:
# bottom 10 websites with the largest count of news articles
website_count=data_df['source_name'].value_counts().reset_index()
website_count.columns=["source_name","article_count"]

bottom_10=website_count.sort_values(by='article_count',ascending=False ).tail(10)
print(bottom_10[['source_name','article_count']])

In [None]:
plt.subplot(1, 2, 2)
sns.barplot(x='article_count', y='source_name',  data=bottom_10, palette='Blues', hue='source_name' ,legend=False)
plt.title('Bottom 10 websites with smallest count of news articles')
plt.xlabel('Number of artciles')
plt.ylabel('Website')

plt.tight_layout()
plt.show()

In [None]:
traffic_df.dropna(subset=['Domain', 'GlobalRank'], inplace=True)

top_traffic_websites = traffic_df.sort_values(by='GlobalRank').head(10)

print(top_traffic_websites[['Domain', 'GlobalRank']])

In [None]:
plt.figure(figsize=(10, 6))
sns.barplot(x='GlobalRank', y='Domain', data=top_traffic_websites, palette='Blues_d')

plt.xlabel('Global Rank')
plt.ylabel('Website')
plt.title('Top 10 Websites with the Highest Traffic')
plt.show()

In [None]:
#Countries with the highest number of news media organisations (represented by domains in the data)

domains_df.dropna(subset=['Country', 'SourceCommonName'], inplace=True)
country_counts = domains_df['Country'].value_counts()
top_countries = country_counts.head(10)

# Plotting
plt.figure(figsize=(10, 6))
top_countries.plot(kind='bar', color='red')
plt.xlabel('Country')
plt.ylabel('Number of News Media Organizations')
plt.title('Top Countries with the Highest Number of News Media Organizations')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Filter articles to include only those that mention countries
articles_with_countries = domains_df.dropna(subset=['Country'])


In [None]:
# Split the 'countries' column into a list of countries
articles_with_countries['Country'] = articles_with_countries['Country'].apply(lambda x: x.split(','))

In [None]:
# Create a list of all mentioned countries
all_countries = [country.strip() for sublist in articles_with_countries['Country'] for country in sublist]

In [None]:
# Count the occurrences of each country
country_counts = pd.Series(all_countries).value_counts()

In [None]:
# Plotting
plt.figure(figsize=(10, 6))
country_counts.head(10).plot(kind='bar', color='skyblue')  # Plot top 10 countries
plt.xlabel('Country')
plt.ylabel('Number of Articles')
plt.title('Countries with the Highest Number of Articles Mentioning Them')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
data_df.columns

In [None]:

# Define dictionaries mapping countries to their respective regions
african_countries = ['Algeria', 'Angola', 'Benin', 'Botswana', 'Burkina Faso', 'Burundi', 'Cabo Verde', 'Cameroon', 'Central African Republic', 'Chad', 'Comoros', 'Democratic Republic of the Congo', 'Djibouti', 'Egypt', 'Equatorial Guinea', 'Eritrea', 'Eswatini', 'Ethiopia', 'Gabon', 'Gambia', 'Ghana', 'Guinea', 'Guinea-Bissau', 'Ivory Coast', 'Kenya', 'Lesotho', 'Liberia', 'Libya', 'Madagascar', 'Malawi', 'Mali', 'Mauritania', 'Mauritius', 'Morocco', 'Mozambique', 'Namibia', 'Niger', 'Nigeria', 'Republic of the Congo', 'Rwanda', 'Sao Tome and Principe', 'Senegal', 'Seychelles', 'Sierra Leone', 'Somalia', 'South Africa', 'South Sudan', 'Sudan', 'Tanzania', 'Togo', 'Tunisia', 'Uganda', 'Zambia', 'Zimbabwe']
eu_countries = ['Austria', 'Belgium', 'Bulgaria', 'Croatia', 'Cyprus', 'Czech Republic', 'Denmark', 'Estonia', 'Finland', 'France', 'Germany', 'Greece', 'Hungary', 'Ireland', 'Italy', 'Latvia', 'Lithuania', 'Luxembourg', 'Malta', 'Netherlands', 'Poland', 'Portugal', 'Romania', 'Slovakia', 'Slovenia', 'Spain', 'Sweden']
middle_east_countries = ['Bahrain', 'Iran', 'Iraq', 'Israel', 'Jordan', 'Kuwait', 'Lebanon', 'Oman', 'Palestine', 'Qatar', 'Saudi Arabia', 'Syria', 'Turkey', 'United Arab Emirates', 'Yemen']
us_countries = ['United States', 'USA', 'U.S.']
china_countries = ['China']

# Combine the regions and their respective countries into a dictionary
region_countries_dict = {
    'Africa': african_countries,
    'EU': eu_countries,
    'Middle East': middle_east_countries,
    'US': us_countries,
    'China': china_countries
}

# Filter articles containing mentions of the specified regions
articles_about_regions = data_df[data_df['content'].str.contains('|'.join(sum(region_countries_dict.values(), [])))]

# Get the websites that reported news content about the specified regions
websites_reporting_about_regions = articles_about_regions['source_name'].unique()

print("Websites reporting news content about the specified regions:")
print(websites_reporting_about_regions)


In [None]:


# Define dictionaries mapping countries to their respective regions
african_countries = ['Algeria', 'Angola', 'Benin', 'Botswana', 'Burkina Faso', 'Burundi', 'Cabo Verde', 'Cameroon', 'Central African Republic', 'Chad', 'Comoros', 'Democratic Republic of the Congo', 'Djibouti', 'Egypt', 'Equatorial Guinea', 'Eritrea', 'Eswatini', 'Ethiopia', 'Gabon', 'Gambia', 'Ghana', 'Guinea', 'Guinea-Bissau', 'Ivory Coast', 'Kenya', 'Lesotho', 'Liberia', 'Libya', 'Madagascar', 'Malawi', 'Mali', 'Mauritania', 'Mauritius', 'Morocco', 'Mozambique', 'Namibia', 'Niger', 'Nigeria', 'Republic of the Congo', 'Rwanda', 'Sao Tome and Principe', 'Senegal', 'Seychelles', 'Sierra Leone', 'Somalia', 'South Africa', 'South Sudan', 'Sudan', 'Tanzania', 'Togo', 'Tunisia', 'Uganda', 'Zambia', 'Zimbabwe']
eu_countries = ['Austria', 'Belgium', 'Bulgaria', 'Croatia', 'Cyprus', 'Czech Republic', 'Denmark', 'Estonia', 'Finland', 'France', 'Germany', 'Greece', 'Hungary', 'Ireland', 'Italy', 'Latvia', 'Lithuania', 'Luxembourg', 'Malta', 'Netherlands', 'Poland', 'Portugal', 'Romania', 'Slovakia', 'Slovenia', 'Spain', 'Sweden']
middle_east_countries = ['Bahrain', 'Iran', 'Iraq', 'Israel', 'Jordan', 'Kuwait', 'Lebanon', 'Oman', 'Palestine', 'Qatar', 'Saudi Arabia', 'Syria', 'Turkey', 'United Arab Emirates', 'Yemen']
us_countries = ['United States', 'USA', 'U.S.']
china_countries = ['China']

# Combine the regions and their respective countries into a dictionary
region_countries_dict = {
    'Africa': african_countries,
    'EU': eu_countries,
    'Middle East': middle_east_countries,
    'US': us_countries,
    'China': china_countries
}

# Filter articles containing mentions of the specified regions
articles_about_regions = data_df[data_df['content'].str.contains('|'.join(sum(region_countries_dict.values(), [])))]

# Get the source names that reported news content about the specified regions
sources_reporting_about_regions = articles_about_regions['source_name']

# Count the occurrences of each source
source_counts = sources_reporting_about_regions.value_counts()

# Plotting
plt.figure(figsize=(10, 6))
source_counts.head(10).plot(kind='bar', color='skyblue')  # Plot top 10 sources
plt.xlabel('Source Name')
plt.ylabel('Number of Articles')
plt.title('Top Sources Reporting News Content about Specified Regions')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()


In [86]:
domains_df.columns


Index(['SourceCommonName', 'location', 'Country'], dtype='object')

In [88]:
traffic_df.columns

Index(['GlobalRank', 'TldRank', 'Domain', 'TLD', 'RefSubNets', 'RefIPs',
       'IDN_Domain', 'IDN_TLD', 'PrevGlobalRank', 'PrevTldRank',
       'PrevRefSubNets', 'PrevRefIPs'],
      dtype='object')

In [89]:
data_df.columns

Index(['article_id', 'source_id', 'source_name', 'author', 'title',
       'description', 'url', 'url_to_image', 'published_at', 'content',
       'category', 'article', 'title_sentiment'],
      dtype='object')

In [100]:
print(data_df['title_sentiment'].dtype)

object


In [101]:
data_df['title_sentiment'] = pd.to_numeric(data_df['title_sentiment'], errors='coerce')  # Coerce errors to NaN


In [102]:
website_sentiment = data_df.groupby('source_name')['title_sentiment'].agg(['mean', 'median', 'var']).reset_index()


In [103]:
# Print results (showing websites with high/low sentiment can be achieved through sorting)
print("Website Sentiment Statistics (Mean, Median, Variance):")
print(website_sentiment.to_string(index=True))

Website Sentiment Statistics (Mean, Median, Variance):
                     source_name  mean  median  var
0                       ABC News   NaN     NaN  NaN
1             Al Jazeera English   NaN     NaN  NaN
2    AllAfrica - Top Africa News   NaN     NaN  NaN
3                Android Central   NaN     NaN  NaN
4                       BBC News   NaN     NaN  NaN
5                    Boing Boing   NaN     NaN  NaN
6               Business Insider   NaN     NaN  NaN
7                            CNA   NaN     NaN  NaN
8                            CNN   NaN     NaN  NaN
9                       Deadline   NaN     NaN  NaN
10                Digital Trends   NaN     NaN  NaN
11                ETF Daily News   NaN     NaN  NaN
12                      Euronews   NaN     NaN  NaN
13                        Forbes   NaN     NaN  NaN
14                   Gizmodo.com   NaN     NaN  NaN
15            Globalsecurity.org   NaN     NaN  NaN
16                 GlobeNewswire   NaN     NaN  NaN
17  Inter

In [104]:
# Sort websites by sentiment metric (e.g., mean sentiment)
website_sentiment_sorted_mean = website_sentiment.sort_values(by='mean', ascending=False)  # Sort by descending mean

In [105]:

# Print top/bottom 5 websites by mean sentiment (modify as needed)
print("\nTop 5 Websites by Mean Sentiment (Descending):")
print(website_sentiment_sorted_mean.head(5))

print("\nBottom 5 Websites by Mean Sentiment (Ascending):")
print(website_sentiment_sorted_mean.tail(5))


Top 5 Websites by Mean Sentiment (Descending):
                   source_name  mean  median  var
0                     ABC News   NaN     NaN  NaN
1           Al Jazeera English   NaN     NaN  NaN
2  AllAfrica - Top Africa News   NaN     NaN  NaN
3              Android Central   NaN     NaN  NaN
4                     BBC News   NaN     NaN  NaN

Bottom 5 Websites by Mean Sentiment (Ascending):
           source_name  mean  median  var
24           The Punch   NaN     NaN  NaN
25  The Times of India   NaN     NaN  NaN
26           The Verge   NaN     NaN  NaN
27                Time   NaN     NaN  NaN
28               Wired   NaN     NaN  NaN


In [5]:


# Load data (assuming 'source_name' is website domain and 'title_sentiment' is sentiment score)

# Group by website and calculate descriptive statistics for sentiment
website_sentiment = data_df.groupby('source_name')['title_sentiment'].agg(['mean', 'median', 'var']).reset_index()

# Print results (showing websites with high/low sentiment can be achieved through sorting)
print("Website Sentiment Statistics (Mean, Median, Variance):")
print(website_sentiment.to_string(index=True))

# Sort websites by sentiment metric (e.g., mean sentiment)
website_sentiment_sorted_mean = website_sentiment.sort_values(by='mean', ascending=False)  # Sort by descending mean

# Print top/bottom 5 websites by mean sentiment (modify as needed)
print("\nTop 5 Websites by Mean Sentiment (Descending):")
print(website_sentiment_sorted_mean.head(5))

print("\nBottom 5 Websites by Mean Sentiment (Ascending):")
print(website_sentiment_sorted_mean.tail(5))




import pandas as pd
import matplotlib.pyplot as plt

# Load data (assuming 'source_name' is website domain and 'title_sentiment' is sentiment score)
df_data = pd.read_csv("data.csv")
df_traffic = pd.read_csv("traffic_data.csv")  # Assuming traffic data

# Merge data (optional, if needed to join with traffic data)
# df_merged = df_data.merge(df_traffic, on='source_name', how='left')  # Left join to keep all data from data.csv

# Group by website and calculate statistics
website_sentiment = df_data.groupby('source_name')['title_sentiment'].agg(['mean', 'median', 'var']).reset_index()

# Top 10 websites by traffic (optional, if traffic data is available)
top_10_traffic = df_traffic.sort_values(by='GlobalRank').head(10)['source_name'].tolist()
website_sentiment_filtered = website_sentiment[website_sentiment['source_name'].isin(top_10_traffic)]  # Filter by top 10

# Print results (mean vs. median comparison)
print("Mean vs. Median Sentiment Comparison:")
for index, row in website_sentiment_filtered.iterrows():
  website_name = row['source_name']
  mean_sentiment = row['mean']
  median_sentiment = row['median']
  print(f"\nWebsite: {website_name}")
  print(f"Mean Sentiment: {mean_sentiment}")
  print(f"Median Sentiment: {median_sentiment}")

# Sentiment distribution for top websites (histograms)
for index, row in website_sentiment_filtered.iterrows():
  website_name = row['source_name']
  website_data = df_data[df_data['source_name'] == website_name]
  plt.figure(figsize=(6, 4))
  plt.hist(website_data['title_sentiment'], bins=10, edgecolor='black', label=website_name)
  plt.xlabel('Sentiment Score')
  plt.ylabel('Number of Articles')
  plt.title(f"Sentiment Distribution for {website_name}")
  plt.grid(True)
  plt.legend()
  plt.tight_layout()
  plt.show()



NameError: name 'data_df' is not defined

In [8]:


# Assuming 'data' is a DataFrame with columns: 'website', 'sentiment'

# Group data by website domain and calculate statistics
df_filtered = data_df[data_df['title_sentiment'].notna()]
domain_sentiment = df_filtered.groupby('source_name')['title_sentiment'].agg(['mean', 'median', 'var'])

# Print descriptive statistics and analyze sentiment distribution for top 10 domains
print(domain_sentiment.describe())

# Sort by website visitors (replace with your data source)
top_10_domains = pd.read_csv('/Users/user/Desktop/martin_1/archive/domains_location.csv')  # Replace with your data source
top_10_domains = top_10_domains.head(10)['site']  # Get top 10 domains

# Filter data for top 10 domains
top_10_data = data_df[data_df['website'].isin(top_10_domains)]

# Analyze sentiment distribution for each domain (replace with your data source for global sentiment)
for domain in top_10_domains:
  domain_data = top_10_data[top_10_data['website'] == domain]
  sentiment_counts = domain_data['sentiment'].value_counts().sort_values(ascending=False)
  print(f"Sentiment Distribution for {domain}:")
  print(sentiment_counts)
  # Add code here to compare with global sentiment distribution (data source needed)


TypeError: agg function failed [how->mean,dtype->object]