In [27]:
# Author: Antonio Drakes
# Date: 06 07 24 
# Description: Create bar chart counting frequency of enties, and bar chart counting frequency of names. 
#Struggles: For the entities it is a lot easier to differentiate, I found removing "the" from the beginning made it a lot easier. However, there may be issues like ANC and African National congress being counted differently. 
#For names, I have not attempted to consolidate any names as yet, eg (Zuma and Jacob Zuma)# For names, I have not attempted to consolidate any names as yet, eg (Zuma and Jacob Zuma)
# Future plans: use my bar chart to sort entities by people who are mentioned in them. 


In [28]:
#import data
import pandas as pd
import spacy
import matplotlib.pyplot as plt
from collections import Counter
import plotly.express as px


In [29]:
df = pd.read_excel(io ='/Users/antoniodrakes/Desktop/Voxcroft/df.xlsx')


In [30]:
df.head()

Unnamed: 0,Article_Date_Published,Article_Body,Article_Content_People_AI_Model,Article_Content_Entities_AI_Model
0,06/03/2024 14:38:46 +00:00,"McDonald’s, the multinational fast-food chain,...",,McDonald’s|the Sunday Times|R80|McDonald
1,06/03/2024 14:36:30 +00:00,Emtee recently opened up about reuniting with ...,Nicole Chinsamy|Nicole Chinsamy|Nicole Chinsam...,Briefly News|Briefly TV Life|Instagram\nEmtee|...
2,06/03/2024 14:36:17 +00:00,Provided by RAF Cash RAF Cash is a dynamic com...,Marc Andrew|Marc Andrew Marc Andrew|Marc,RAF Cash RAF Cash|the Road Acci|the National R...
3,06/03/2024 14:33:00 +00:00,Monday 14:13overcast cloudsTomorrow:light rain...,Cyril Ramaphosa|Nelson Mandela Bay|Ramaphosa|R...,Knysna-Plett Herald|FAIR|the Press Council of ...
4,06/03/2024 14:32:24 +00:00,Mzansi gospel singers will be hosting a prayer...,Zanele Mbokazi|Zanele Mbokazi|Zanele Mbokazi|Z...,Mzansi|Ukhozi|Crown Gospel Music Awards|Ukhozi...


In [31]:
# Extract names from the 'Article_Content_Entities_AI_Model' column
df['Article_Content_Entities_AI_Model'] = df['Article_Content_Entities_AI_Model'].fillna('')
all_entities = df['Article_Content_Entities_AI_Model'].str.cat(sep='|').split('|')


# Filter only names (PERSON entities) and remove "the" at the beginning
name_entities = [entity for entity in all_entities if ' ' in entity and not entity.startswith('the ')]

# Count the mentions of each name
name_counts = Counter(name_entities)

# Convert to a DataFrame for visualization
name_counts_df = pd.DataFrame(name_counts.items(), columns=['Name', 'Count']).sort_values(by='Count', ascending=False)

# Display the top 10 mentioned names
print(name_counts_df.head(10))

# Select the top 10 mentioned names for the bar chart
top_names_df = name_counts_df.head(10)

# Create an interactive bar chart with Plotly
fig = px.bar(top_names_df, x='Name', y='Count', color='Name', title='Top 10 Most Mentioned Names in Articles (Excluding "The" at Beginning)',
             labels={'Name': 'Names', 'Count': 'Mentions'},
             hover_data={'Name': True, 'Count': True})

# Update layout for better readability
fig.update_layout(xaxis_tickangle=-45)

# Show the interactive chart
fig.show()

                          Name  Count
0                 Briefly News   2360
83             Orlando Pirates   2244
73               Kaizer Chiefs   1782
20              Arena Holdings   1268
29                    MK Party   1187
1              Briefly TV Life   1053
9    Contact the Press Council    805
197                 City Power    580
235                  TS Galaxy    526
127  African National Congress    490






In [32]:
# Fill missing values with an empty string and convert column to string type
df['Article_Content_People_AI_Model'] = df['Article_Content_People_AI_Model'].fillna('').astype(str)

# Extract people names from the 'Article_Content_People_AI_Model' column
all_names = ' '.join(df['Article_Content_People_AI_Model']).split('|')

# Count the occurrences of each person name
name_counts = Counter(all_names)

# Convert to a DataFrame for visualization
name_counts_df = pd.DataFrame(name_counts.items(), columns=['Name', 'Count']).sort_values(by='Count', ascending=False)

# Select the top 10 mentioned names for the bar chart
top_names_df = name_counts_df.head(10)

# Create an interactive bar chart with Plotly
fig = px.bar(top_names_df, x='Name', y='Count',
             title='Top 10 Most Commonly Mentioned People Names in Articles',
             labels={'Name': 'Name', 'Count': 'Frequency'})

# Update layout for better readability
fig.update_layout(xaxis_tickangle=-45)

# Show the interactive chart
fig.show()

In [33]:
name_counts_df.head()

Unnamed: 0,Name,Count
38,Zuma,4208
12,Ramaphosa,3440
178,Cyril Ramaphosa,1862
207,Trump,1693
37,Jacob Zuma,1504


In [34]:

# Combine all names mentioned in the articles
all_names = df['Article_Content_People_AI_Model'].str.cat(sep='|').split('|')

# Filter and count only valid names (PERSON entities) and remove "the" at the beginning
name_entities = [name.strip() for name in all_names if ' ' in name and not name.lower().startswith('the ')]
name_counts = Counter(name_entities)

# Convert to a DataFrame and get the top 10 names
name_counts_df = pd.DataFrame(name_counts.items(), columns=['Name', 'Count']).sort_values(by='Count', ascending=False)
top_names = name_counts_df.head(10)['Name'].tolist()

# Initialize a list to hold the relationships
entity_people_pairs = []

# Iterate over each row to capture relationships involving the top 10 names
for index, row in df.iterrows():
    people = row['Article_Content_People_AI_Model'].split('|')
    filtered_people = [person.strip() for person in people if ' ' in person and not person.lower().startswith('the ')]
    for name in filtered_people:
        if name in top_names:
            for person in filtered_people:
                if person != name:
                    entity_people_pairs.append((name, person))

# Create a DataFrame from the relationships
relationships_df = pd.DataFrame(entity_people_pairs, columns=['Top_Name', 'Associated_Person'])

# Count the mentions of each top name and associated person pair
entity_person_counts = relationships_df.value_counts().reset_index(name='Count')

# Create an interactive bar chart with Plotly
fig = px.bar(entity_person_counts, x='Top_Name', y='Count', color='Associated_Person',
             title='Top 10 Most Mentioned Names and Associated People in Articles',
             labels={'Top_Name': 'Top Names', 'Count': 'Mentions'},
             hover_data={'Top_Name': True, 'Associated_Person': True, 'Count': True})

# Update layout for better readability
fig.update_layout(xaxis_tickangle=-45)

# Show the interactive chart
fig.show()





In [36]:
# Combine all entities mentioned in the articles
all_entities = df['Article_Content_Entities_AI_Model'].str.cat(sep='|').split('|')

# Filter and count valid entities, removing "the" at the beginning
entity_names = [entity.strip() for entity in all_entities if ' ' in entity and not entity.lower().startswith('the ')]
entity_counts = Counter(entity_names)

# Convert to a DataFrame and get the top 10 entities
entity_counts_df = pd.DataFrame(entity_counts.items(), columns=['Entity', 'Count']).sort_values(by='Count', ascending=False)
top_entities = entity_counts_df.head(10)['Entity'].tolist()

# Initialize a list to hold the relationships
entity_people_pairs = []

# Iterate over each row to capture relationships involving the top 10 entities
for index, row in df.iterrows():
    entities = row['Article_Content_Entities_AI_Model'].split('|')
    people = row['Article_Content_People_AI_Model'].split('|')
    
    # Clean up the lists
    filtered_entities = [entity.strip() for entity in entities if entity.strip() in top_entities]
    filtered_people = [person.strip() for person in people if person.strip() in top_names]

    
    for entity in filtered_entities:
        for person in filtered_people:
            entity_people_pairs.append((entity, person))

# Create a DataFrame from the relationships
relationships_df = pd.DataFrame(entity_people_pairs, columns=['Entity', 'Associated_Person'])

# Count the mentions of each entity and associated person pair
entity_person_counts = relationships_df.value_counts().reset_index(name='Count')

# Create an interactive bar chart with Plotly
fig = px.bar(entity_person_counts, x='Entity', y='Count', color='Associated_Person',
             title='Top 10 Most Mentioned Entities and Associated People in Articles',
             labels={'Entity': 'Entities', 'Count': 'Mentions'},
             hover_data={'Entity': True, 'Associated_Person': True, 'Count': True})

# Update layout for better readability
fig.update_layout(xaxis_tickangle=-45)

# Show the interactive chart
fig.show()





In [None]:
#top entities to top persons