In [4]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Load the dataset
data = pd.read_csv('cleandata_processed.csv', nrows=500)

# Define a simple stopwords list
simple_stopwords = set([
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours',
    'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself',
    'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which',
    'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be',
    'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an',
    'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by',
    'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before',
    'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over',
    'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why',
    'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no',
    'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will',
    'just', 'don', 'should', 'now'
])

# Preprocessing function without NLTK
def preprocess_text_simple(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()  # Lowercase
    text = re.sub(r'\d+', '', text)  # Remove digits
    text = re.sub(r'\W+', ' ', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    words = text.split()
    words = [word for word in words if word not in simple_stopwords]  # Remove stopwords
    return ' '.join(words)

# Apply preprocessing to the Article_Body column
data['Processed_Article_Body'] = data['Article_Body'].apply(preprocess_text_simple)

# Initialize the TF-IDF vectorizer with a maximum number of features
vectorizer = TfidfVectorizer(max_features=1000)

# Fit and transform the processed article body text
tfidf_matrix = vectorizer.fit_transform(data['Processed_Article_Body'])

# Convert the TF-IDF matrix to a DataFrame for better readability
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

# Save the TF-IDF DataFrame to a CSV file
tfidf_df.to_csv('tfidf_analysis_results.csv', index=False)

# Function to get the top N terms with the highest TF-IDF scores
def get_top_n_terms(tfidf_matrix, feature_names, top_n=10):
    summed_tfidf = tfidf_matrix.sum(axis=0)
    sorted_indices = np.argsort(summed_tfidf)[::-1][:top_n]
    top_terms = [(feature_names[i], summed_tfidf[0, i]) for i in sorted_indices]
    return top_terms

# Get the feature names
feature_names = vectorizer.get_feature_names_out()

# Get the top 20 terms across all documents
top_terms = get_top_n_terms(tfidf_matrix, feature_names, top_n=20)

# Print the top terms
for term, score in top_terms:
     print(f"Term: {term}, Score: {score[0,0]:.4f}")


Term: [['moment' 'unlikely' 'blinken' 'allies' 'matters' 'movement' 'half'
  'otherwise' 'ukrainian' 'speaking' 'reach' 'switzerland' 'else' 'seems'
  'conflict' 'concerns' 'governance' 'simply' 'territory' 'wing' 'drop'
  'claim' 'courts' 'union' 'attacks' 'happened' 'france' 'destruction'
  'likely' 'aimed' 'trip' 'towards' 'subkh' 'patriotic' 'special'
  'candidates' 'phase' 'knee' 'hands' 'avoid' 'opposition' 'best' 'felt'
  'communist' 'noted' 'title' 'job' 'formed' 'evening' 'early'
  'totalenergies' 'calling' 'forwards' 'versfeld' 'partner'
  'consequences' 'mayor' 'anti' 'russian' 'means' 'furthermore' 'keep'
  'double' 'der' 'goes' 'door' 'zelensky' 'sidelined' 'option' 'share'
  'research' 'mean' 'ready' 'rights' 'served' 'looking' 'required' 'meet'
  'everyone' 'chance' 'hold' 'macron' 'kzn' 'fighters' 'name' 'approach'
  'nadal' 'represent' 'true' 'electricity' 'zealand' 'leadership' 'feel'
  'leinster' 'spot' 'sentences' 'repeatedly' 'great' 'ambassador'
  'particularly' '

In [5]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
import numpy as np

# Load the pre-trained spaCy model
nlp = spacy.load("en_core_web_sm")

# Load your dataset
new_data = pd.read_csv('cleandata_processed.csv', nrows = 500)  # Ensure 'data_processed.csv' is in the same directory

# Combine relevant columns
new_data['Combined_Names'] = new_data['Name'].fillna('') + ' ' + new_data['Article_Themes_AI_Model'].fillna('')

# Preprocess text
def preprocess_text_simple(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()  # Lowercase
    text = re.sub(r'\d+', '', text)  # Remove digits
    text = re.sub(r'\W+', ' ', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    words = text.split()
    words = [word for word in words if word not in simple_stopwords]  # Remove stopwords
    return ' '.join(words)

new_data['Processed_Names'] = new_data['Combined_Names'].apply(preprocess_text_simple)

# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=1000)
tfidf_matrix = vectorizer.fit_transform(new_data['Processed_Names'])
feature_names = vectorizer.get_feature_names_out()

# Get top N terms
def get_top_n_terms(tfidf_matrix, feature_names, top_n=10):
    summed_tfidf = tfidf_matrix.sum(axis=0)
    sorted_indices = np.argsort(summed_tfidf)[::-1][:top_n]
    top_terms = [(feature_names[i], summed_tfidf[0, i]) for i in sorted_indices]
    return top_terms

top_terms = get_top_n_terms(tfidf_matrix, feature_names, top_n=20)
print("Top TF-IDF Terms:", top_terms)

# Analyze context for "Biden"
for text in new_data['Combined_Names']:
    doc = nlp(text)
    biden_contexts = [sent for sent in doc.sents if "Biden" in sent.text]
    for context in biden_contexts:
        if "Joe" in context.text or "President" in context.text:
            print(f"Text: {text}")
            print("Context:", context)
            print("Refers to Joe Biden")


Top TF-IDF Terms: [(array([['leader', 'nozikhungo', 'dialogue', 'ritual', 'interreligious',
        'tshona', 'ceremony', 'seelan', 'gobalsamy', 'sibeko', 'siphiti',
        'charco', 'cranes', 'donati', 'robert', 'masilela', 'makwe',
        'towell', 'körner', 'graeme', 'nader', 'mass', 'dj', 'dineo',
        'ranaka', 'mohammed', 'marina', 'hijab', 'medvin', 'satyendra',
        'sahni', 'singh', 'southern', 'fintech', 'gemalto', 'africa',
        'altron', 'zuraya', 'adhikarie', 'adv', 'barnard', 'heine',
        'lynnette', 'solomon', 'anil', 'merten', 'ferial', 'marianne',
        'moloto', 'haffajee', 'mothapo', 'hitler', 'adolf', 'lagardien',
        'andile', 'pot', 'ismail', 'irwin', 'pol', 'mngxitama', 'jim',
        'sarah', 'yenesel', 'bendjama', 'amar', 'ursula', 'leyen', 'von',
        'der', 'khanyi', 'connie', 'karabo', 'ferguson', 'mbau',
        'moroka', 'jessica', 'wilmot', 'white', 'jake', 'coombs',
        'andrew', 'bill', 'poet', 'renowned', 'jose', 'file', 'af

Unnamed: 0,Article_Date_Published,Article_Body,Article_Content_People_AI_Model,Article_Content_Entities_AI_Model,Article_Source,Voice,Article_Themes_AI_Model,Article_Subject_Keyword_Identified,Article_Topic_Keyword_Identified,Combined_Names,Processed_Names
0,6/11/2024 7:34,Reading Time: 3 minutes\nIsrael’s parliament m...,Benny Gantz|Benjamin Netanyahu|Yoav Gallant|Ga...,Knesset|Gantz|Defence|Gantz|Resented|the Israe...,sabcnews.com,,"[Primary: Conflict, war and peace|92% |Seconda...",,Armed Conflict|Crime & Terrorism|State Security,Benny Gantz|Benjamin Netanyahu|Yoav Gallant|Ga...,benny gantz benjamin netanyahu yoav gallant ga...
1,6/11/2024 5:10,ANC’s aggrieved PEC members draw a line in the...,Mathabatha|Stan Mathabatha|Reuben Madadzhe|Reg...,ANC|PEC|Limpopo ANC|ANC|PEC|the Electoral Comm...,citizen.co.za,Alex Japho Matlala,[Primary: Society|67% |Secondary: Uncategorize...,ANC,,Mathabatha|Stan Mathabatha|Reuben Madadzhe|Reg...,mathabatha stan mathabatha reuben madadzhe reg...
2,6/11/2024 5:00,Daily news update: Axed MK founder alone at Pa...,Axed MK|Jabulani Khumalo|Ajay Gupta|Lethabo Le...,Parliament | SA|| Court|Ace’s|WeSizwe party|MK...,citizen.co.za,Nicholas Zaal,"[Primary: Economy, business and finance|51% |S...",Cape Town|Johannesburg|South Africa,Armed Conflict|Civil Unrest & Protest|Crime & ...,Axed MK|Jabulani Khumalo|Ajay Gupta|Lethabo Le...,axed mk jabulani khumalo ajay gupta lethabo le...
3,6/11/2024 4:52,Search continues for plane carrying Malawi VP ...,Saulos Chilima|Saulos Klaus|Saulos Klaus|Valen...,Office of the President|Cabinet|the Malawi Def...,citizen.co.za,Faizel Patel,[Primary: Politics|52% |Secondary: Uncategoriz...,,State Security,Saulos Chilima|Saulos Klaus|Saulos Klaus|Valen...,saulos chilima saulos klaus saulos klaus valen...
4,6/11/2024 4:40,Axing corrupt Tshwane cops: ‘more action neede...,Axing|Nigel Sibanda/|Grandi Theunissen|Jaco Ba...,TMPD|Department|Tshwane Metro Police Departmen...,citizen.co.za,Marizka Coetzer,"[Primary: Crime, law and justice|100% |Seconda...",South Africa,Crime & Terrorism|State Security,Axing|Nigel Sibanda/|Grandi Theunissen|Jaco Ba...,axing nigel sibanda grandi theunissen jaco bar...
...,...,...,...,...,...,...,...,...,...,...,...
195,6/10/2024 19:37,"In a distressing development, a military aircr...",Saulos Chilima|Saulos Chilima|Lazarus Chakwera...,Malawi Defense Force,surgezirc.co.za,,[Primary: Politics|80% |Secondary: Uncategoriz...,DA,State Security,Saulos Chilima|Saulos Chilima|Lazarus Chakwera...,saulos chilima saulos chilima lazarus chakwera...
196,6/10/2024 19:35,The City of Tshwane says it will implement loa...,Lindela Mashigo|eNCA|Lehakwe Tlali,Soweto Gospel Choir,news365.co.za,Nelly Ndlovu,"[Primary: Economy, business and finance|95% |S...",South Africa,,Lindela Mashigo|eNCA|Lehakwe Tlali Soweto Gosp...,lindela mashigo enca lehakwe tlali soweto gosp...
197,6/10/2024 19:35,Chief justice Raymond Zondo has declared June ...,Raymond Zondo|Zondo,the National Assembly|the National Council of ...,businesslive.co.za,,[Primary: Politics|94% |Secondary: Government ...,ANC|DA,,Raymond Zondo|Zondo the National Assembly|the ...,raymond zondo zondo national assembly national...
198,6/10/2024 19:32,The Department of Transport says it is on trac...,Gemalto Altron Fintech Southern Africa,The Department of Transport|Cabinet|SCM|Corpor...,economy24.co.za,,"[Primary: Economy, business and finance|82% |S...",South Africa,,Gemalto Altron Fintech Southern Africa The Dep...,gemalto altron fintech southern africa departm...
