# ADS 509 Text Mining Project

**Lorena Dorado & Parisa Kamizi** 

## Load and Explore the Dataset Structure

In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import glob
import os
import html
import re
import string
import json
import logging
from datetime import datetime
from collections import Counter
from tqdm import tqdm
from langdetect import DetectorFactory
from typing import List, Dict, Any

# Text processing
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from langdetect import detect
import spacy
from gensim.models import Phrases
from gensim.models.phrases import Phraser

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Topic Modeling
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, TruncatedSVD, LatentDirichletAllocation
import pyLDAvis
import pyLDAvis.lda_model
import pyLDAvis.gensim_models
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models.coherencemodel import CoherenceModel
import gensim.corpora as corpora
from gensim.utils import simple_preprocess

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# Download required NLTK resources
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

In [2]:
# Configure Logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('text_mining.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)
logger.info("Setup complete")

2025-02-22 14:33:38,853 - INFO - Setup complete


In [3]:
# Set up text processing configurations
# stop_words = set(stopwords.words('english')) ADD other languages
stop_words = set(stopwords.words('english') + 
                 stopwords.words('spanish') + 
                 stopwords.words('french') + 
                 stopwords.words('german'))

# Add custom stopwords
custom_stopwords = {'ul', 'li', 'ol', 'div', 'span', 'href', 'src', 'img', 'p', 'br', 'nbsp', 'char', 'id', 'av', 'lv'}
stop_words.update(custom_stopwords)

website_stopwords = ['transparency', 'accept', 'partner', 'click', 'consent', 'cookie', 'policy', 'privacy', 'terms', 'use', 'agreement', 'site']
stop_words.update(website_stopwords)

punctuation_set = set(string.punctuation) - {"#"}  # Keep hashtags
lemmatizer = WordNetLemmatizer()

#### Load Data

In [4]:
# Load the data
column_mapping = {
    'source_name': 'source',
    'publishedAt': 'date',
    'content': 'text',
    'title': 'title',
    'description': 'description'
}

csv_files = glob.glob(os.path.join('.', '*.csv'))
dfs = []

for file in csv_files:
    try:
        temp_df = pd.read_csv(file)
        dfs.append(temp_df)
        logger.info(f"Successfully loaded {file}")
    except Exception as e:
        logger.error(f"Error reading file {file}: {str(e)}")

if not dfs:
    raise ValueError("No CSV files were successfully loaded")

news_df = pd.concat(dfs, ignore_index=True)
news_df = news_df[column_mapping.keys()].rename(columns=column_mapping)
news_df['date'] = pd.to_datetime(news_df['date'])

print("\nInitial Dataset Info:")
print(news_df.info())
print("\nFirst few rows:")
print(news_df.head())

2025-02-22 14:33:38,886 - INFO - Successfully loaded .\Deepseek_Day_Five.csv
2025-02-22 14:33:38,899 - INFO - Successfully loaded .\Deepseek_Day_Four.csv
2025-02-22 14:33:38,899 - INFO - Successfully loaded .\Deepseek_Day_One.csv
2025-02-22 14:33:38,912 - INFO - Successfully loaded .\Deepseek_Day_Three.csv
2025-02-22 14:33:38,912 - INFO - Successfully loaded .\Deepseek_Day_Two.csv



Initial Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 755 entries, 0 to 754
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype              
---  ------       --------------  -----              
 0   source       570 non-null    object             
 1   date         570 non-null    datetime64[ns, UTC]
 2   text         570 non-null    object             
 3   title        570 non-null    object             
 4   description  552 non-null    object             
dtypes: datetime64[ns, UTC](1), object(4)
memory usage: 29.6+ KB
None

First few rows:
             source                      date  \
0       Gizmodo.com 2025-02-04 15:00:56+00:00   
1  Business Insider 2025-02-04 18:25:21+00:00   
2      Substack.com 2025-02-04 21:28:04+00:00   
3  Business Insider 2025-02-04 21:26:32+00:00   
4      heise online 2025-02-04 14:00:00+00:00   

                                                text  \
0  Usually when large language models are given t...   
1

## Data Cleaning with Tokenization and Normalization

In [5]:
#### Text processing pipeline functions
def validate_text(text):
    """Check if text is valid"""
    return "" if not isinstance(text, str) or pd.isna(text) else text

def basic_clean(text):
    """Basic text cleaning"""
    return (html.unescape(text)
            .lower()
            .replace('\n', ' ')
            .replace('\r', ' '))

def remove_special_chars(text):
    """Remove special characters and patterns"""
    return re.sub(r'\[\+\d+ chars\]|https?://\S+|â€™|â€"|[^\w\s\-\'.,!?]', ' ', text)

def remove_num_patterns(text):
    """Remove date patterns and numbers (including those with commas)"""
    return re.sub(r'\b\d{1,4}[-/]\d{1,2}[-/]\d{1,4}\b|\b\d{1,3}(,\d{3})*(\.\d+)?\b|\b\d+\b', '', text)

def clean_whitespace(text):
    """Clean extra whitespace"""
    return re.sub(r'\s+', ' ', text).strip()

def remove_punct(text):
    """Remove punctuation"""
    return "".join(ch for ch in text if ch not in punctuation_set)

def get_tokens(text):
    """Get tokens without stopwords, lemmatize, remove numbers, and filter short tokens"""
    doc = nlp(text)
    return [token.lemma_.lower() for token in doc 
            if token.is_alpha 
            and len(token.text) > 2 
            and not token.is_stop 
            and token.lemma_.lower() not in stop_words 
            and not token.like_num]

def detect_lang(text):
    """Detect language safely"""
    try:
        return 'unknown' if len(text.strip()) < 50 else detect(text)
    except:
        return 'unknown'

def remove_duplicates(df):
    """Remove duplicate articles based on content similarity"""
    df['text_signature'] = df['title'] + df['text'].str[:200]
    original_len = len(df)
    df = df.drop_duplicates(subset=['text_signature'])
    df = df.drop('text_signature', axis=1)
    logger.info(f"Removed {original_len - len(df)} duplicate articles")
    return df

In [6]:
def process_text(text, steps=None):
    if steps is None:
        steps = ['validate', 'basic', 'special', 'num_patterns', 'whitespace', 'punct', 'tokens']
    
    text = validate_text(text)
    
    pipeline_steps = {
        'validate': validate_text,
        'basic': basic_clean,
        'special': remove_special_chars,
        'num_patterns': remove_num_patterns,
        'whitespace': clean_whitespace,
        'punct': remove_punct,
        'tokens': get_tokens
    }
    
    for step in steps:
        text = pipeline_steps[step](text)
    
    return text if isinstance(text, list) else text  # Return tokens as a list if the last step was tokenization

In [7]:
# Remove duplicates
logger.info("Removing duplicates...")
news_df = remove_duplicates(news_df)

# Process text
logger.info("Processing text...")
tqdm.pandas()
news_df['clean_text'] = news_df['text'].progress_apply(
    lambda x: process_text(x, ['validate', 'basic', 'special', 'num_patterns', 'whitespace']))

# Detect languages
logger.info("Detecting languages...")
news_df['language'] = news_df['clean_text'].progress_apply(detect_lang)

# Create cleaned content
logger.info("Creating cleaned content...")
news_df['cleaned_content'] = news_df['text'].progress_apply(
    lambda x: process_text(x, ['validate', 'basic', 'punct', 'whitespace']))

# Generate tokens
logger.info("Generating tokens...")
def get_cleaned_text_and_tokens(text):
    cleaned = process_text(text, ['validate', 'basic', 'punct', 'whitespace'])
    tokens = get_tokens(cleaned)
    return ' '.join(tokens), tokens

temp_results = news_df['text'].progress_apply(get_cleaned_text_and_tokens)
news_df['Cleaned_Text'] = temp_results.apply(lambda x: x[0])
news_df['Tokens'] = temp_results.apply(lambda x: x[1])

# Display processed data sample
print("\nProcessed DataFrame columns:")
print(news_df.columns.tolist())
print("\nSample of processed data:")
print(news_df[['source', 'date', 'clean_text', 'language', 'Tokens']].head())

2025-02-22 14:33:38,977 - INFO - Removing duplicates...
2025-02-22 14:33:38,977 - INFO - Removed 286 duplicate articles
2025-02-22 14:33:38,977 - INFO - Processing text...
100%|█████████████████████████████████████████████████████████████████████████████| 469/469 [00:00<00:00, 24469.21it/s]
2025-02-22 14:33:39,012 - INFO - Detecting languages...
100%|███████████████████████████████████████████████████████████████████████████████| 469/469 [00:03<00:00, 125.18it/s]
2025-02-22 14:33:42,759 - INFO - Creating cleaned content...
100%|█████████████████████████████████████████████████████████████████████████████| 469/469 [00:00<00:00, 30240.25it/s]
2025-02-22 14:33:42,790 - INFO - Generating tokens...
100%|███████████████████████████████████████████████████████████████████████████████| 469/469 [00:03<00:00, 117.72it/s]


Processed DataFrame columns:
['source', 'date', 'text', 'title', 'description', 'clean_text', 'language', 'cleaned_content', 'Cleaned_Text', 'Tokens']

Sample of processed data:
             source                      date  \
0       Gizmodo.com 2025-02-04 15:00:56+00:00   
1  Business Insider 2025-02-04 18:25:21+00:00   
2      Substack.com 2025-02-04 21:28:04+00:00   
3  Business Insider 2025-02-04 21:26:32+00:00   
4      heise online 2025-02-04 14:00:00+00:00   

                                          clean_text language  \
0  usually when large language models are given t...       en   
1  meta's cto, andrew bosworth, talked about deep...       en   
2  the release of deepseek has upset american tec...       en   
3  sundar pichai speaks during a google i o confe...       en   
4  inhaltsverzeichnis der japanische technologiek...       de   

                                              Tokens  
0  [usually, large, language, model, give, test, ...  
1  [metas, cto, andrew, b




## Descriptive Statistics

In [8]:
def descriptive_stats_all(df: pd.DataFrame, tokens_col: str = 'Tokens', 
                         text_col: str = 'text', title: str = "Dataset",
                         num_tokens: int = 5, plot: bool = True) -> Dict[str, Any]:

    # Initialize results dictionary
    stats = {}
    
    # Get all tokens
    all_tokens = [token for tokens in df[tokens_col] for token in tokens]
    token_counts = Counter(all_tokens)
    
    # Basic token statistics
    stats['total_tokens'] = len(all_tokens)
    stats['unique_tokens'] = len(set(all_tokens))
    stats['total_characters'] = len(''.join(all_tokens))
    stats['lexical_diversity'] = stats['unique_tokens'] / stats['total_tokens'] if stats['total_tokens'] > 0 else 0
    
    # Document statistics
    stats['total_documents'] = len(df)
    
    # Token length statistics
    token_lengths = [len(token) for token in all_tokens]

    # Top tokens
    stats['top_tokens'] = token_counts.most_common(num_tokens)
    
    # Print results if verbose
    print(f"\nDescriptive Statistics for '{title}':")
    print("-" * 50)
    print(f"Total documents: {stats['total_documents']:,}")
    
    print(f"\nToken-level Statistics:")
    print(f"Total tokens: {stats['total_tokens']:,}")
    print(f"Unique tokens: {stats['unique_tokens']:,}")
    print(f"Total characters: {stats['total_characters']:,}")
    print(f"Lexical diversity: {stats['lexical_diversity']:.3f}")
    
    print(f"\nTop {num_tokens} most frequent tokens:")
    for token, count in stats['top_tokens']:
        print(f"{token}: {count:,}")
    
    return stats

stats = descriptive_stats_all(news_df, tokens_col='Tokens', text_col='text',
                              title="News Articles", num_tokens=10, plot=True)


Descriptive Statistics for 'News Articles':
--------------------------------------------------
Total documents: 469

Token-level Statistics:
Total tokens: 7,307
Unique tokens: 3,493
Total characters: 49,720
Lexical diversity: 0.478

Top 10 most frequent tokens:
deepseek: 415
openai: 132
model: 77
chatgpt: 62
chinese: 58
nvidia: 57
artificial: 56
startup: 44
tech: 43
china: 39


## Topic Modeling

In [9]:
# Create TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words=list(stop_words), max_features=5000, max_df=0.95, min_df=2)
tfidf_matrix = tfidf_vectorizer.fit_transform(news_df['clean_text'])

# Create Count vectorizer
count_vectorizer = CountVectorizer(stop_words=list(stop_words), max_features=5000, max_df=0.95, min_df=2)
count_matrix = count_vectorizer.fit_transform(news_df['clean_text'])

In [10]:
# Function to display topics
def display_topics(model, features, no_top_words=5):
    for topic, words in enumerate(model.components_):
        total = words.sum()
        largest = words.argsort()[::-1] # invert sort order
        print("\nTopic %02d" % topic)
        for i in range(0, no_top_words):
            print("  %s (%2.2f)" % (features[largest[i]], abs(words[largest[i]]*100.0/total)))

## NMF

In [11]:
# Fit NMF model
n_topics = 5
nmf_model = NMF(n_components=n_topics, random_state=42)
nmf_output = nmf_model.fit_transform(tfidf_matrix)

# Display the topics
feature_names = tfidf_vectorizer.get_feature_names_out()
print("NMF Topics:")
display_topics(nmf_model, tfidf_vectorizer.get_feature_names_out())

NMF Topics:

Topic 00
  deepseek (23.87)
  r1 (6.87)
  chatgpt (3.60)
  nvidia (2.62)
  v3 (1.48)

Topic 01
  partners (9.61)
  iab (9.61)
  words (9.61)
  information (9.49)
  part (9.46)

Topic 02
  openai (12.64)
  o3 (4.69)
  mini (4.65)
  altman (4.11)
  chatgpt (4.09)

Topic 03
  chinese (2.21)
  model (2.10)
  tech (1.63)
  new (1.46)
  startup (1.31)

Topic 04
  artificial (4.38)
  inteligencia (4.19)
  ia (4.17)
  modelo (3.01)
  modelos (2.14)


## LSA

In [12]:
# Fit LSA model
lsa_model = TruncatedSVD(n_components=n_topics, random_state=42)
lsa_output = lsa_model.fit_transform(tfidf_matrix)

print("\nLSA Topics:")
display_topics(lsa_model, tfidf_vectorizer.get_feature_names_out())


LSA Topics:

Topic 00
  deepseek (8.90)
  r1 (2.93)
  openai (2.63)
  chatgpt (2.19)
  nvidia (1.16)

Topic 01
  iab (10.82)
  partners (10.82)
  words (10.82)
  information (10.67)
  part (10.65)

Topic 02
  openai (11.14)
  mini (4.91)
  o3 (4.90)
  altman (3.96)
  chatgpt (3.14)

Topic 03
  chinese (2.46)
  model (2.10)
  tech (1.84)
  new (1.60)
  startup (1.47)

Topic 04
  ia (10.43)
  artificial (10.32)
  inteligencia (10.04)
  modelo (7.05)
  china (5.28)


## LDA

In [13]:
# Fit LDA model
lda_model = LatentDirichletAllocation(n_components=n_topics, random_state=42)
lda_output = lda_model.fit_transform(count_matrix)

print("\nLDA Topics:")
display_topics(lda_model, count_vectorizer.get_feature_names_out())


LDA Topics:

Topic 00
  deepseek (3.51)
  di (1.95)
  including (1.62)
  week (1.60)
  new (1.44)

Topic 01
  deepseek (11.19)
  nvidia (3.39)
  r1 (2.06)
  artificial (1.48)
  chatgpt (1.24)

Topic 02
  deepseek (7.96)
  ia (2.27)
  inteligencia (1.95)
  artificial (1.94)
  modelo (1.61)

Topic 03
  deepseek (8.98)
  openai (6.84)
  chatgpt (2.32)
  mini (2.13)
  o3 (1.97)

Topic 04
  deepseek (4.20)
  openai (2.14)
  intelligence (2.05)
  artificial (1.90)
  ki (1.72)


In [14]:
# Visualize LDA results
pyLDAvis.enable_notebook()
lda_vis = pyLDAvis.lda_model.prepare(lda_model, count_matrix, count_vectorizer)
pyLDAvis.display(lda_vis)

The LDA visualization show five distinct topics, with Topic 1 being the most prevalent and Topics 2 and 5 showing some overlap. The most salient terms include AI-related words like "openai," "nvidia," and "deepseek," alongside technical and business-related terms, indicating a focus on AI companies, technologies, and partnerships. Term frequency analysis highlights "deepseek" as the most frequent term. The marginal topic distribution shows the relative importance of different topics within the corpus.

## Model Evaluation

The higher coherence score will be selected as the best model as it indicates more interpretable topics.

In [15]:
# Calculate coherence scores for each model
def calculate_coherence(model, feature_names, doc_term_matrix):
    coherence_scores = []
    for topic in model.components_:
        top_words = [feature_names[i] for i in topic.argsort()[:-10 - 1:-1]]
        word_indices = [list(feature_names).index(word) for word in top_words]
        topic_vectors = doc_term_matrix[:, word_indices]
        pairwise_similarities = cosine_similarity(topic_vectors.T)
        coherence = pairwise_similarities.mean()
        coherence_scores.append(coherence)
    return coherence_scores

nmf_coherence = calculate_coherence(nmf_model, tfidf_vectorizer.get_feature_names_out(), tfidf_matrix)
lsa_coherence = calculate_coherence(lsa_model, tfidf_vectorizer.get_feature_names_out(), tfidf_matrix)
lda_coherence = calculate_coherence(lda_model, count_vectorizer.get_feature_names_out(), count_matrix)

print("\nCoherence Scores:")
print(f"NMF: {np.mean(nmf_coherence):.4f}")
print(f"LSA: {np.mean(lsa_coherence):.4f}")
print(f"LDA: {np.mean(lda_coherence):.4f}")


Coherence Scores:
NMF: 0.3922
LSA: 0.3927
LDA: 0.2323


Use the most frequent words to try to discern what semantic groups the unsupervised topics might have identified

In [16]:
# Extract top words and their frequencies for each topic
n_top_words = 15
topic_word_freq = []

for topic_idx, topic in enumerate(lda_model.components_):
    top_features_ind = topic.argsort()[:-n_top_words - 1:-1]
    top_features = tfidf_vectorizer.get_feature_names_out()[top_features_ind]
    weights = topic[top_features_ind]
    topic_word_freq.append(dict(zip(top_features, weights)))

# Calculate the relevance of each word across all topics
word_topic_relevance = {}
for topic_idx, word_freq in enumerate(topic_word_freq):
    for word, freq in word_freq.items():
        if word not in word_topic_relevance:
            word_topic_relevance[word] = []
        word_topic_relevance[word].append((topic_idx, freq))
        
# Identify semantic groups based on word relevance across topics
threshold = 0.05
semantic_groups = {}
for word, relevances in word_topic_relevance.items():
    relevant_topics = [topic for topic, score in relevances if score >= threshold]
    if len(relevant_topics) > 1:
        group_name = f"Group_{'-'.join(map(str, relevant_topics))}"
        if group_name not in semantic_groups:
            semantic_groups[group_name] = []
        semantic_groups[group_name].append(word)

# Display the identified semantic groups with interpretations
topic_interpretations = [
    "Topic 1",
    "Topic 2",
    "Topic 3",
    "Topic 4",
    "Topic 5"
]

print("\nIdentified semantic groups:")
for group, words in semantic_groups.items():
    topics = [int(t) for t in group.split('_')[1].split('-')]
    interpretations = [topic_interpretations[t] for t in topics]
    print(f"\n{group}: {', '.join(interpretations)}")
    print(f"Words: {', '.join(words)}")


Identified semantic groups:

Group_0-1-2-3-4: Topic 1, Topic 2, Topic 3, Topic 4, Topic 5
Words: deepseek

Group_0-3: Topic 1, Topic 4
Words: tech

Group_0-3-4: Topic 1, Topic 4, Topic 5
Words: chinese

Group_1-2-3-4: Topic 2, Topic 3, Topic 4, Topic 5
Words: r1, china

Group_1-2-4: Topic 2, Topic 3, Topic 5
Words: artificial, ia

Group_1-3: Topic 2, Topic 4
Words: chatgpt

Group_1-2: Topic 2, Topic 3
Words: modelo, inteligencia

Group_1-4: Topic 2, Topic 5
Words: intelligence, apple

Group_2-4: Topic 3, Topic 5
Words: startup

Group_3-4: Topic 4, Topic 5
Words: openai
