In [22]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Load the dataset
data = pd.read_csv('cleandata_processed.csv', nrows=6258)



# Define a simple stopwords list
simple_stopwords = set([
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours',
    'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself',
    'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which',
    'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be',
    'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an',
    'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by',
    'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before',
    'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over',
    'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why',
    'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no',
    'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will',
    'just', 'don', 'should', 'now'
])

# Preprocessing function without NLTK
def preprocess_text_simple(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()  # Lowercase
    text = re.sub(r'\d+', '', text)  # Remove digits
    text = re.sub(r'\W+', ' ', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    words = text.split()
    words = [word for word in words if word not in simple_stopwords]  # Remove stopwords
    return ' '.join(words)

# Apply preprocessing to the Article_Body column
data['Processed_Article_Body'] = data['Article_Body'].apply(preprocess_text_simple)

# Initialize the TF-IDF vectorizer with a maximum number of features
vectorizer = TfidfVectorizer(max_features=1000)

# Fit and transform the processed article body text
tfidf_matrix = vectorizer.fit_transform(data['Processed_Article_Body'])

# Convert the TF-IDF matrix to a DataFrame for better readability
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

# Save the TF-IDF DataFrame to a CSV file
tfidf_df.to_csv('tfidf_analysis_results.csv', index=False)

# Function to get the top N terms with the highest TF-IDF scores
def get_top_n_terms(tfidf_matrix, feature_names, top_n=10):
    summed_tfidf = tfidf_matrix.sum(axis=0)
    sorted_indices = np.argsort(summed_tfidf)[::-1][:top_n]
    top_terms = [(feature_names[i], summed_tfidf[0, i]) for i in sorted_indices]
    return top_terms

# Get the feature names
feature_names = vectorizer.get_feature_names_out()

# Get the top 20 terms across all documents
top_terms = get_top_n_terms(tfidf_matrix, feature_names, top_n=20)

# Print the top terms
for term, score in top_terms:
     print(f"Term: {term}, Score: {score[0,0]:.4f}")


Term: [['facd' 'ca' 'rec' 'ee' 'fe' 'caa' 'fff' 'nogap' 'eab' 'fcb' 'inset'
  'bbfa' 'tdi_' 'ea' 'webkit' 'rgba' 'koo' 'flipboard' 'viber' 'tumblr'
  'tabs' 'notext' 'kakao' 'naver' 'gettr' 'cf' 'pinterest' 'cde' 'vk'
  'digg' 'stumbleupon' 'handler' 'reddit' 'telegram' 'copy_url' 'font'
  'ec' 'arrow' 'linkedin' 'width' 'radius' 'padding' 'colored'
  'transparent' 'bar' 'transform' 'opacity' 'align' 'margin' 'shadow'
  'relative' 'bg' 'solid' 'box' 'absolute' 'background' 'height' 'none'
  'index' 'expand' 'whatsapp' 'block' 'medicine' 'transition' 'display'
  'print' 'style' 'uct' 'bottom' 'max' 'px' 'sharing' 'middle' 'hiv'
  'text' 'color' 'check' 'either' 'debuts' 'raid' 'size' 'mngomezulu'
  'canon' 'stand' 'leicester' 'schools' 'camps' 'dm' 'tigers' 'built'
  'target' 'word' 'ministry' 'eagles' 'closely' 'featuring' 'backs'
  'wing' 'main' 'fouche' 'ons' 'foreign' 'groups' 'twickenham' 'southern'
  'vodacom' 'jasper' 'forwards' 'exactly' 'uk' 'matters' 'heart' 'border'
  'dixon'

In [None]:
line_number = 6260

with open('cleandata_processed.csv', 'r') as file:
    for current_line_number, line in enumerate(file, start=1):
        if current_line_number == line_number:
            print(line)
            break

The devices fitted to Hera and Royal Wasi weigh just 3.2 grams each and are powered by tiny solar panels roughly the size of the nail on a human pinkie finger.



In [None]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
import numpy as np

# Load the pre-trained spaCy model
nlp = spacy.load("en_core_web_sm")

# Load your dataset
new_data = pd.read_csv('cleandata_processed.csv', nrows=6258)  # Ensure 'data_processed.csv' is in the same directory

# Combine relevant columns
new_data['Combined_Names'] = new_data['Name'].fillna('') + ' ' + new_data['Article_Themes_AI_Model'].fillna('')

# Define a simple stopwords list
simple_stopwords = set([
    'a', 'an', 'the', 'and', 'or', 'but', 'if', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with',
    'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from',
    'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there',
    'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such',
    'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don',
    'should', 'now'
])

# Preprocess text
def preprocess_text_simple(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()  # Lowercase
    text = re.sub(r'\d+', '', text)  # Remove digits
    text = re.sub(r'\W+', ' ', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    words = text.split()
    words = [word for word in words if word not in simple_stopwords]  # Remove stopwords
    return ' '.join(words)

new_data['Processed_Names'] = new_data['Combined_Names'].apply(preprocess_text_simple)

# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=1000)
tfidf_matrix = vectorizer.fit_transform(new_data['Processed_Names'])
feature_names = vectorizer.get_feature_names_out()

# Get top N terms
def get_top_n_terms(tfidf_matrix, feature_names, top_n=10):
    summed_tfidf = tfidf_matrix.sum(axis=0)
    sorted_indices = np.argsort(summed_tfidf.A1)[::-1][:top_n]

    top_terms = [(feature_names[i], summed_tfidf[0, i]) for i in sorted_indices]
    return top_terms

top_terms = get_top_n_terms(tfidf_matrix, feature_names, top_n=20)

# Print top terms in "term-value, term-value, etc." format
top_terms_str = ', '.join([f"{term}-{float(value):.4f}" for term, value in top_terms])
print("Top TF-IDF Terms:", top_terms_str)


Top TF-IDF Terms: secondary-1083.2037, primary-1083.2037, politics-501.8300, uncategorized-482.0231, sport-479.3240, entertainment-337.1080, arts-337.1080, human-336.4550, interest-307.4623, culture-298.7359, law-288.7092, crime-278.2821, economy-264.1351, emergency-252.2693, political-251.1274, media-248.6320, justice-245.7942, business-238.0946, incident-235.2170, government-201.6397


In [None]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
import numpy as np

# Load the pre-trained spaCy model
nlp = spacy.load("en_core_web_sm")

# Load your dataset
new_data = pd.read_csv('cleandata_processed.csv', nrows=6258)  # Ensure 'data_processed.csv' is in the same directory

# Combine relevant columns
new_data['Combined_Names'] = new_data['Name'].fillna('') + ' ' + new_data['Article_Themes_AI_Model'].fillna('')

# Define a simple stopwords list
simple_stopwords = set([
    'a', 'an', 'the', 'and', 'or', 'but', 'if', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with',
    'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from',
    'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there',
    'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such',
    'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don',
    'should', 'now'
])

# Preprocess text
def preprocess_text_simple(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()  # Lowercase
    text = re.sub(r'\d+', '', text)  # Remove digits
    text = re.sub(r'\W+', ' ', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    words = text.split()
    words = [word for word in words if word not in simple_stopwords]  # Remove stopwords
    return ' '.join(words)

new_data['Processed_Names'] = new_data['Combined_Names'].apply(preprocess_text_simple)

# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=1000)
tfidf_matrix = vectorizer.fit_transform(new_data['Processed_Names'])
feature_names = vectorizer.get_feature_names_out()

# Preprocess names similarly
new_data['Processed_Individual_Names'] = new_data['Name'].apply(preprocess_text_simple)
processed_names = new_data['Processed_Individual_Names'].unique()

# Assign TF-IDF scores to names
tfidf_scores = {}

for name in processed_names:
    if name in feature_names:
        name_index = np.where(feature_names == name)[0][0]
        tfidf_scores[name] = tfidf_matrix[:, name_index].sum()

# Print TF-IDF scores for names
for name, score in tfidf_scores.items():
    print(f"{name}: {score:.4f}")


theunissen: 2.7976
eskom: 2.9184
ramaphosa: 61.0263
ramp: 45.9069
steyn: 24.2435
chakwera: 5.0601
amakhosi: 2.7085
mpho: 4.8082
magashule: 4.7242
aslan: 1.9962
alcaraz: 6.3239
biden: 12.0605
bill: 2.3538
bush: 3.7468
macron: 17.5503
kolbe: 17.2976
leinster: 7.7263
gumede: 8.9691
vinicius: 7.0372
ajay: 3.1580
anil: 2.1573
gupta: 7.0514
zuma: 48.9237
malema: 17.6601
mdluli: 2.3871
cosatu: 2.3252
dladlu: 2.1753
khayelitsha: 7.4987
bok: 3.1369
bester: 3.1229
lebo: 8.1412
makhene: 11.0500
letoya: 10.2714
bafana: 29.8229
broos: 6.2538
abahambe: 1.4662
mckenzie: 7.0301
maharaj: 4.6306
alexforbes: 2.9628
koko: 3.7148
gosiame: 2.1824
mangena: 7.9714
miller: 13.1593
dj: 3.6446
infantino: 4.3525
wenger: 1.8459
shivambu: 6.9777
alexandra: 2.2169
guterres: 2.8793
zondo: 18.2005
sizwe: 1.6354
lebohang: 14.1680
benni: 2.3030
percy: 2.3058
nonku: 2.7325
denny: 1.9694
lawrence: 1.0281
shein: 2.7254
temu: 2.7254
khumalo: 12.5843
elizabeth: 2.1473
anthony: 2.7527
chilima: 3.9341
twerka: 4.9648
chauke: 10

In [29]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Load your dataset
new_data = pd.read_csv('cleandata_processed.csv', nrows=6259)  # Ensure 'cleandata_processed.csv' is in the same directory

# Combine relevant columns
new_data['Combined_Names'] = new_data['Name'].fillna('') + ' ' + new_data['Article_Themes_AI_Model'].fillna('')

# Define a simple stopwords list
simple_stopwords = set([
    'a', 'an', 'the', 'and', 'or', 'but', 'if', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with',
    'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from',
    'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there',
    'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such',
    'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don',
    'should', 'now'
])

# Preprocess text
def preprocess_text_simple(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()  # Lowercase
    text = re.sub(r'\d+', '', text)  # Remove digits
    text = re.sub(r'\W+', ' ', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    words = text.split()
    words = [word for word in words if word not in simple_stopwords]  # Remove stopwords
    return ' '.join(words)

new_data['Processed_Names'] = new_data['Combined_Names'].apply(preprocess_text_simple)

# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=1000)
tfidf_matrix = vectorizer.fit_transform(new_data['Processed_Names'])
feature_names = vectorizer.get_feature_names_out()

# Preprocess names similarly and include only those with more than two words but fewer than four words
def preprocess_name(name):
    processed_name = preprocess_text_simple(name)
    word_count = len(processed_name.split())
    if 2 < word_count:  # Include only names with more than two words but fewer than four words
        return processed_name
    return ""

new_data['Processed_Individual_Names'] = new_data['Name'].apply(preprocess_name)
processed_names = new_data['Processed_Individual_Names'].unique()
processed_names = [name for name in processed_names if name]  # Remove empty strings

# Debug: Print processed names to verify
print("Processed Names:", processed_names)

# Assign TF-IDF scores to names
tfidf_scores = {}

for name in processed_names:
    name_terms = name.split()
    for term in name_terms:
        if term in feature_names:
            term_index = np.where(feature_names == term)[0][0]
            score = tfidf_matrix[:, term_index].sum()
            if name in tfidf_scores:
                tfidf_scores[name] += score
            else:
                tfidf_scores[name] = score

# Debug: Print tfidf_scores to verify
print("TF-IDF Scores:", tfidf_scores)

# Print TF-IDF scores for names
for name, score in tfidf_scores.items():
    print(f"{name}: {score:.4f}")


Processed Names: ['gerda gerda steyn', 'darren stewart gallo images', 'al jama ah', 'gemalto altron fintech southern africa', 'lood de jager jean kleyn', 'kurt lee arendse', 'nomusa dube ncube', 'saulos klaus chilima', 'lazarus chakwera search', 'marikana https co eipgdydxtr ramaphosa', 'marikana https co eipgdydxtr', 'linda thomas greenfield', 'omar al bashir', 'paul kruger gate', 'ursula von der leyen', 'frennie shivambu gallo images', 'renowned poet mzwakhe mbuli', 'satyendra singh sahni', 'geordin hill lewis', 'john maynard keynes', 'gossipmonger musa khawula', 'dawie de villiers', 'richard huggard gallo images', 'pixley ka isaka', 'adv zuraya adhikarie', 'lethabo lejoy mathato', 'md grant orsmond', 'jose jordan afp file vinicius junior', 'lungi de ville', 'captian fc van wyk', 'lethabo lejoy mathatho', 'maegan leigh jacobs', 'skhothane sa pitori', 'kabza de small', 'fw de klerk', 'majlis e ittehadul muslimeen', 'n biren singh', 'rex van schalkwyk', 'des van rooyen', 'von der leyen