# Word2Vec Analysis for News Articles

This notebook trains Word2Vec embeddings on the news articles in the `Articles.csv` dataset and uses them to analyze the corpus.

## Features:
- Text preprocessing and cleaning
- Word2Vec model training
- Document similarity analysis
- Clustering and visualization
- Temporal analysis

## 1. Import Libraries and Setup

In [1]:
# Install required packages (run once)
# !pip install gensim nltk scikit-learn matplotlib seaborn wordcloud

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import warnings
warnings.filterwarnings('ignore')

# NLP libraries
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Word2Vec and ML libraries
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity

# Download the NLTK resources used by this notebook (quiet mode)
for nltk_resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(nltk_resource, quiet=True)

# Notebook-wide display and plotting defaults.
# The matplotlib style is applied before the seaborn palette, as in the original order.
pd.set_option('max_colwidth', 100)
plt.style.use('default')
sns.set_palette("husl")

print("‚úì Libraries imported successfully!")

## 2. Load and Explore Dataset

In [None]:
# Read the raw article table from disk
df = pd.read_csv('Articles.csv')

# Structural overview: dimensions, column names, and per-column null counts
missing_per_column = df.isnull().sum()
column_names = df.columns.tolist()

print(f"Dataset shape: {df.shape}")
print(f"Columns: {column_names}")
print(f"\nMissing values:\n{missing_per_column}")

# Preview the first rows (rendered as the cell's output)
df.head()

In [None]:
# Explore the data structure
# Missing articles are skipped when picking the sample: a NaN entry is a float,
# and slicing it with [:500] would raise TypeError.
non_null_articles = df['Article text'].dropna()

print("Sample article text:")
if non_null_articles.empty:
    print("(no article text available)")
else:
    print(str(non_null_articles.iloc[0])[:500] + "...")

# Character counts per article, computed once and reused for both statistics
article_lengths = df['Article text'].str.len()
print(f"\nAverage article length: {article_lengths.mean():.0f} characters")
print(f"Median article length: {article_lengths.median():.0f} characters")

In [None]:
# Analyze categories and sections
# Each panel is drawn only when its own column exists. Previously only 'Category'
# was checked, so a dataset without a 'Section' column raised KeyError.
panel_specs = [
    # (column, panel title, number of top values to keep or None for all)
    ('Category', 'Articles by Category', None),
    ('Section', 'Top 10 Sections', 10),
]

panel_data = []
for column, title, top_n in panel_specs:
    if column not in df.columns:
        continue
    counts = df[column].value_counts()
    if top_n is not None:
        counts = counts.head(top_n)
    # An empty series (column entirely missing values) cannot be drawn as a bar chart
    if not counts.empty:
        panel_data.append((title, counts))

if panel_data:
    fig, axes = plt.subplots(1, len(panel_data), figsize=(12, 5), squeeze=False)
    for ax, (title, counts) in zip(axes[0], panel_data):
        counts.plot(kind='bar', ax=ax)
        ax.set_title(title)
        ax.tick_params(axis='x', labelrotation=45)

    fig.tight_layout()
    plt.show()

## 3. Text Preprocessing

In [None]:
# Shared preprocessing resources: the English stop-word set and a Porter stemmer
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()


def preprocess_text(text):
    """Turn one raw article into a list of stemmed tokens.

    Missing input (as detected by ``pd.isna``) yields an empty list. Otherwise
    the text is converted to a lower-cased string, every character that is not
    an ASCII letter or whitespace is replaced by a space, and the result is
    split on whitespace. Tokens that are English stop words or that have two
    characters or fewer are discarded; the remaining tokens are Porter-stemmed.

    Args:
        text: Raw article text (any value; it is passed through ``str``).

    Returns:
        list[str]: Stemmed tokens in their original order.
    """
    if pd.isna(text):
        return []

    lowered = str(text).lower()
    letters_only = re.sub(r'[^a-zA-Z\s]', ' ', lowered)

    # str.split() without arguments already collapses runs of whitespace,
    # so no separate whitespace-normalization pass is needed.
    stems = []
    for word in letters_only.split():
        if len(word) <= 2 or word in stop_words:
            continue
        stems.append(ps.stem(word))

    return stems

# Test preprocessing on the first article that actually has text.
# Missing entries are skipped: a NaN article is a float, and slicing it with
# [:200] would raise TypeError.
non_null_articles = df['Article text'].dropna()
sample_text = '' if non_null_articles.empty else str(non_null_articles.iloc[0])[:200]
processed_sample = preprocess_text(sample_text)

print("Original text (first 200 chars):")
print(sample_text)
print("\nProcessed tokens (first 15):")
print(processed_sample[:15])

In [None]:
# Tokenize every article with the preprocessing pipeline
print("Preprocessing all articles...")
df['processed_tokens'] = df['Article text'].apply(preprocess_text)

# Keep only articles that still have at least one token, then renumber the rows
has_tokens = df['processed_tokens'].apply(len) > 0
df = df[has_tokens].reset_index(drop=True)

# Token counts of the surviving articles, computed once for both statistics
tokens_per_article = df['processed_tokens'].apply(len)

print(f"Number of articles after preprocessing: {len(df)}")
print(f"Average tokens per article: {tokens_per_article.mean():.1f}")
print(f"Median tokens per article: {tokens_per_article.median():.1f}")

## 4. Word2Vec Model Training

In [None]:
# Prepare sentences for Word2Vec: one token list per article
sentences = df['processed_tokens'].tolist()

# Train Word2Vec model
print("Training Word2Vec model...")
w2v_model = Word2Vec(
    sentences=sentences,
    vector_size=100,      # Dimensionality of word vectors
    window=5,             # Context window size
    min_count=3,          # Ignore words with frequency less than this
    workers=4,            # Number of worker threads
    epochs=10,            # Number of training epochs
    sg=0,                 # 0 for CBOW, 1 for Skip-gram
    # Fixed seed so the embeddings are initialised the same way on every run,
    # matching the seeded KMeans/PCA steps later in the notebook.
    # NOTE: per the gensim documentation, a fully deterministic run additionally
    # requires workers=1 and a fixed PYTHONHASHSEED; with 4 workers small
    # run-to-run differences from thread scheduling can remain.
    seed=42
)

print(f"‚úì Word2Vec model trained successfully!")
print(f"‚úì Vocabulary size: {len(w2v_model.wv.key_to_index)}")
print(f"‚úì Vector dimensionality: {w2v_model.wv.vector_size}")

# Save the model
w2v_model.save('articles_word2vec.model')
print("‚úì Model saved as 'articles_word2vec.model'")

## 5. Word Similarity Analysis

In [None]:
# Flatten the per-article token lists into one corpus-wide list of tokens
all_words = [token for article_tokens in sentences for token in article_tokens]

# Rank tokens by how often they occur across the corpus
word_freq = pd.Series(all_words).value_counts()
top_words = word_freq.head(20)

print("Top 20 most frequent words:")
print(top_words)

# Bar chart of the 20 most frequent tokens
fig, ax = plt.subplots(figsize=(12, 6))
top_words.plot(kind='bar', ax=ax)
ax.set_title('Top 20 Most Frequent Words')
ax.set_xlabel('Words')
ax.set_ylabel('Frequency')
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Probe the embedding space with a handful of (already stemmed) query terms
test_words = ['econom', 'technolog', 'russia', 'china', 'covid', 'climat', 'energi']

# Keep only the query terms that made it into the model vocabulary
vocabulary = w2v_model.wv.key_to_index
available_words = [candidate for candidate in test_words if candidate in vocabulary]

print(f"Testing similarities for available words: {available_words}")

# Only the first five in-vocabulary terms are queried
queried_words = available_words[:5]
for word in queried_words:
    try:
        similar_words = w2v_model.wv.most_similar(word, topn=5)
        print(f"\nüîç Words similar to '{word}':")
        for sim_word, similarity in similar_words:
            print(f"   {sim_word}: {similarity:.3f}")
    except Exception as e:
        # Report the failure and continue with the next query term
        print(f"Error finding similarities for '{word}': {e}")

## 6. Document Vector Creation

In [None]:
def get_document_vector(tokens, model, vector_size=None):
    """Get a document vector by averaging the vectors of its in-vocabulary tokens.

    Args:
        tokens: Iterable of tokens belonging to one document.
        model: Object whose ``wv`` attribute exposes ``key_to_index`` (vocabulary
            membership), item lookup (``wv[token]``) and ``vector_size``.
        vector_size: Length of the all-zero fallback vector. When ``None``
            (the default) the size is read from ``model.wv.vector_size``, so the
            fallback always matches the model instead of a hard-coded 100.

    Returns:
        numpy.ndarray: The element-wise mean of the known tokens' vectors, or a
        zero vector of length ``vector_size`` when no token is in the vocabulary.
    """
    known_vectors = [
        model.wv[token] for token in tokens if token in model.wv.key_to_index
    ]

    if known_vectors:
        return np.mean(known_vectors, axis=0)

    # No usable token: fall back to zeros with the model's own dimensionality
    if vector_size is None:
        vector_size = model.wv.vector_size
    return np.zeros(vector_size)

# Build one averaged embedding per article and stack them into a single array
print("Creating document vectors...")
doc_vectors = np.array([
    get_document_vector(article_tokens, w2v_model)
    for article_tokens in df['processed_tokens']
])
print(f"‚úì Document vectors created: {doc_vectors.shape}")

## 7. Document Clustering

In [None]:
# Partition the document vectors into a fixed number of K-means clusters
n_clusters = 6
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
clusters = kmeans.fit_predict(doc_vectors)

# Store each article's cluster id next to its row
df['cluster'] = clusters

# Cluster sizes, ordered by cluster id
cluster_counts = pd.Series(clusters).value_counts().sort_index()
print(f"Documents clustered into {n_clusters} groups:")
print(cluster_counts)

# Bar chart of how many documents landed in each cluster
fig, ax = plt.subplots(figsize=(10, 6))
cluster_counts.plot(kind='bar', ax=ax)
ax.set_title('Document Distribution Across Clusters')
ax.set_xlabel('Cluster')
ax.set_ylabel('Number of Documents')
ax.tick_params(axis='x', labelrotation=0)
plt.show()

In [None]:
# Show up to three sample headlines from each non-empty cluster
print("üì∞ Sample headlines from each cluster:")
for i in range(n_clusters):
    cluster_docs = df[df['cluster'] == i]
    if cluster_docs.empty:
        continue

    print(f"\nüè∑Ô∏è Cluster {i} ({len(cluster_docs)} articles):")
    for j, headline in enumerate(cluster_docs['Headline'].head(3), start=1):
        # str() keeps a missing headline (NaN is a float) from raising
        # TypeError when it is sliced.
        print(f"   {j}. {str(headline)[:80]}...")

## 8. Visualization with PCA

In [None]:
# Project the document vectors onto their first two principal components
pca = PCA(n_components=2, random_state=42)
doc_vectors_2d = pca.fit_transform(doc_vectors)

# Values shared by the scatter panels
pc_x = doc_vectors_2d[:, 0]
pc_y = doc_vectors_2d[:, 1]
pc1_label = f'PC1 ({pca.explained_variance_ratio_[0]:.2%} variance)'
pc2_label = f'PC2 ({pca.explained_variance_ratio_[1]:.2%} variance)'

# 2x2 dashboard; each panel is added to the same figure explicitly
fig = plt.figure(figsize=(14, 10))

# Panel 1: documents coloured by K-means cluster
ax_clusters = fig.add_subplot(2, 2, 1)
scatter = ax_clusters.scatter(pc_x, pc_y, c=clusters, cmap='tab10', alpha=0.7, s=50)
fig.colorbar(scatter, ax=ax_clusters)
ax_clusters.set_title('Document Clusters (Word2Vec + PCA)')
ax_clusters.set_xlabel(pc1_label)
ax_clusters.set_ylabel(pc2_label)
ax_clusters.grid(True, alpha=0.3)

# Panel 2: documents coloured by category (slot stays empty without that column)
if 'Category' in df.columns:
    ax_categories = fig.add_subplot(2, 2, 2)
    categories = df['Category'].astype('category')
    scatter = ax_categories.scatter(pc_x, pc_y, c=categories.cat.codes,
                                    cmap='Set3', alpha=0.7, s=50)
    ax_categories.set_title('Documents by Category')
    ax_categories.set_xlabel(pc1_label)
    ax_categories.set_ylabel(pc2_label)
    ax_categories.grid(True, alpha=0.3)

# Panel 3: share of variance captured by each of the two components
ax_variance = fig.add_subplot(2, 2, 3)
ax_variance.bar(['PC1', 'PC2'], pca.explained_variance_ratio_)
ax_variance.set_title('PCA Explained Variance')
ax_variance.set_ylabel('Explained Variance Ratio')

# Panel 4: relative cluster sizes
ax_sizes = fig.add_subplot(2, 2, 4)
cluster_counts.plot(kind='pie', autopct='%1.1f%%', ax=ax_sizes)
ax_sizes.set_title('Cluster Distribution')
ax_sizes.set_ylabel('')

fig.tight_layout()
fig.savefig('word2vec_analysis_dashboard.png', dpi=300, bbox_inches='tight')
plt.show()

## 9. Document Similarity Analysis

In [None]:
def find_similar_documents(doc_index, doc_vectors, df, top_n=3):
    """Find the documents most similar to a given document by cosine similarity.

    Args:
        doc_index: Row position of the query document in ``doc_vectors``.
        doc_vectors: 2-D array with one vector per document.
        df: Unused by the computation; kept so existing calls keep working.
        top_n: Maximum number of similar documents to return.

    Returns:
        tuple: ``(similar_indices, similarities)`` - row positions of the most
        similar documents in descending order of similarity, and their scores.
        The query document itself is never included.
    """
    target_vector = doc_vectors[doc_index].reshape(1, -1)
    similarities = cosine_similarity(target_vector, doc_vectors)[0]

    # Rank every document from most to least similar, then remove the query by
    # its index. Dropping the first ranked entry is not reliable: an exact
    # duplicate can outrank the query, and an all-zero query vector has
    # similarity 0 with itself.
    ranked_indices = np.argsort(similarities)[::-1]
    similar_indices = ranked_indices[ranked_indices != doc_index][:top_n]

    return similar_indices, similarities[similar_indices]

# Test similarity for a few documents (positions that exist in the dataframe)
print("üîç Document Similarity Analysis")

for test_idx in [0, 10, 20]:  # Test multiple documents
    if test_idx >= len(df):
        continue

    # str() keeps a missing headline (NaN is a float) from raising TypeError
    # when it is sliced below.
    query_headline = str(df.iloc[test_idx]['Headline'])
    print(f"\nüìÑ Document {test_idx}:")
    print(f"Headline: {query_headline[:80]}...")

    similar_indices, similarities = find_similar_documents(test_idx, doc_vectors, df, top_n=3)

    print("Most similar articles:")
    for rank, (idx, sim) in enumerate(zip(similar_indices, similarities), start=1):
        headline = str(df.iloc[idx]['Headline'])
        print(f"  {rank}. Similarity: {sim:.3f}")
        print(f"     {headline[:80]}...")

## 10. Temporal Analysis

In [None]:
# Temporal analysis of publication dates (runs only when the date column exists)
if 'Date published' in df.columns:
    # Values that cannot be parsed become NaT
    df['date'] = pd.to_datetime(df['Date published'], errors='coerce')

    if df['date'].notna().any():
        # Monthly period of each article, shared by all groupings below
        month = df['date'].dt.to_period('M')

        # Group by month
        monthly_counts = df.groupby(month).size()

        print("üìÖ Articles per month:")
        print(monthly_counts.tail(12))  # Show last 12 months

        # One figure with four panels. Every pandas plot receives its target
        # axes explicitly: DataFrame.plot() without `ax` opens a brand-new
        # figure, which scattered the panels over several figures and made
        # savefig() write only the last one.
        fig = plt.figure(figsize=(15, 8))

        # Overall trend
        ax_total = fig.add_subplot(2, 2, 1)
        monthly_counts.plot(kind='line', marker='o', ax=ax_total)
        ax_total.set_title('Articles Over Time')
        ax_total.set_xlabel('Month')
        ax_total.set_ylabel('Number of Articles')
        ax_total.tick_params(axis='x', labelrotation=45)
        ax_total.grid(True, alpha=0.3)

        # Category trends if available
        if 'Category' in df.columns:
            category_time = df.groupby([month, 'Category']).size().unstack(fill_value=0)
            if not category_time.empty:
                ax_category = fig.add_subplot(2, 2, 2)
                category_time.plot(kind='area', stacked=True, ax=ax_category)
                ax_category.set_title('Categories Over Time')
                ax_category.set_xlabel('Month')
                ax_category.set_ylabel('Number of Articles')
                ax_category.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
                ax_category.tick_params(axis='x', labelrotation=45)

        # Cluster trends
        ax_cluster = fig.add_subplot(2, 2, 3)
        cluster_time = df.groupby([month, 'cluster']).size().unstack(fill_value=0)
        cluster_time.plot(kind='line', marker='o', ax=ax_cluster)
        ax_cluster.set_title('Clusters Over Time')
        ax_cluster.set_xlabel('Month')
        ax_cluster.set_ylabel('Number of Articles')
        ax_cluster.tick_params(axis='x', labelrotation=45)
        ax_cluster.grid(True, alpha=0.3)

        # Day of week pattern; weekdays without any article are shown as 0
        ax_weekday = fig.add_subplot(2, 2, 4)
        day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
        day_pattern = df['date'].dt.day_name().value_counts().reindex(day_order, fill_value=0)
        day_pattern.plot(kind='bar', ax=ax_weekday)
        ax_weekday.set_title('Articles by Day of Week')
        ax_weekday.set_xlabel('Day')
        ax_weekday.set_ylabel('Total Articles')
        ax_weekday.tick_params(axis='x', labelrotation=45)

        fig.tight_layout()
        fig.savefig('temporal_analysis.png', dpi=300, bbox_inches='tight')
        plt.show()
    else:
        # Nothing to plot: an empty series would make pandas' plot call fail
        print("No parseable dates found in 'Date published'; skipping temporal plots.")

## 11. Advanced Analysis: Topic Keywords per Cluster

In [None]:
# Analyze dominant words in each cluster
print("üè∑Ô∏è Dominant words in each cluster:")

for cluster_id in range(n_clusters):
    cluster_docs = df[df['cluster'] == cluster_id]

    # Flatten the token lists of every article in this cluster
    cluster_tokens = [
        token for tokens in cluster_docs['processed_tokens'] for token in tokens
    ]

    # Count word frequencies (most frequent first)
    token_freq = pd.Series(cluster_tokens).value_counts()

    print(f"\nCluster {cluster_id} ({len(cluster_docs)} articles):")
    print(f"Top keywords: {', '.join(token_freq.head(10).index)}")

    # Show category distribution for this cluster
    if 'Category' in df.columns:
        cluster_categories = cluster_docs['Category'].value_counts()
        # to_dict() yields plain Python ints; dict(series) keeps NumPy scalars,
        # which print as e.g. np.int64(12) under NumPy 2.
        print(f"Main categories: {cluster_categories.head(3).to_dict()}")

## 12. Model Evaluation and Summary

In [None]:
# Final report: corpus, model, clustering and PCA statistics
summary_rule = "=" * 50

print("üìä WORD2VEC MODEL SUMMARY")
print(summary_rule)
print(f"‚úì Total articles processed: {len(df):,}")
print(f"‚úì Vocabulary size: {len(w2v_model.wv.key_to_index):,} unique words")
print(f"‚úì Vector dimensionality: {w2v_model.wv.vector_size}")
print(f"‚úì Average tokens per article: {df['processed_tokens'].apply(len).mean():.1f}")
print(f"‚úì Documents clustered into: {n_clusters} groups")
print(f"‚úì PCA explained variance: {pca.explained_variance_ratio_.sum():.2%}")

# Optional lines, printed only when the corresponding source column exists
if 'Category' in df.columns:
    print(f"‚úì Categories covered: {df['Category'].nunique()}")

if 'Date published' in df.columns:
    # 'date' is the parsed column created by the temporal-analysis cell
    date_range = df['date'].max() - df['date'].min()
    print(f"‚úì Time span: {date_range.days} days")

print("\nüéØ KEY INSIGHTS:")
print(f"‚Ä¢ Most frequent words: {', '.join(word_freq.head(5).index)}")
print(f"‚Ä¢ Largest cluster: {cluster_counts.max()} articles (Cluster {cluster_counts.idxmax()})")
print(f"‚Ä¢ Smallest cluster: {cluster_counts.min()} articles (Cluster {cluster_counts.idxmin()})")

# Static bullet lists, printed one entry per line
print("\nüíæ SAVED FILES:")
for saved_file_line in (
    "‚Ä¢ articles_word2vec.model - Trained Word2Vec model",
    "‚Ä¢ word2vec_analysis_dashboard.png - Main visualization",
    "‚Ä¢ temporal_analysis.png - Time series analysis",
):
    print(saved_file_line)

print("\nüöÄ NEXT STEPS:")
for next_step_line in (
    "‚Ä¢ Experiment with different vector sizes and window sizes",
    "‚Ä¢ Try Skip-gram vs CBOW models",
    "‚Ä¢ Compare with TF-IDF or other embeddings",
    "‚Ä¢ Build classification models using these embeddings",
    "‚Ä¢ Explore topic modeling with LDA",
):
    print(next_step_line)

## 13. Usage Examples

In [None]:
# Printed cheat sheet showing how to reuse the trained model
keyed_vectors = w2v_model.wv

print("üìñ USAGE EXAMPLES:")
print("=" * 30)

# 1. Loading the saved model
print("\n1. Load the saved model:")
print("   from gensim.models import Word2Vec")
print("   model = Word2Vec.load('articles_word2vec.model')")

# 2. Looking up a word vector (shown only if the example stem is in the vocabulary)
if 'econom' in keyed_vectors.key_to_index:
    vector = keyed_vectors['econom']
    print("\n2. Get word vector (example for 'econom'):")
    print("   vector = model.wv['econom']")
    print(f"   Shape: {vector.shape}, First 5 values: {vector[:5]}")

# 3. Nearest-neighbour query
print("\n3. Find similar words:")
print("   similar = model.wv.most_similar('word', topn=5)")

# 4. Pairwise similarity, demonstrated on the first two vocabulary entries
vocab_words = list(keyed_vectors.key_to_index)
if len(vocab_words) >= 2:
    word1, word2 = vocab_words[:2]
    similarity = keyed_vectors.similarity(word1, word2)
    print("\n4. Calculate word similarity:")
    print(f"   similarity = model.wv.similarity('{word1}', '{word2}')")
    print(f"   Result: {similarity:.3f}")

print("\n‚úÖ Analysis complete! Check the saved files and model for further use.")