<a href="https://colab.research.google.com/github/TommasoG85/Collective-intelligence/blob/main/Collective_intelligence_Arxiv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#!pip install bertopic

import requests
import xml.etree.ElementTree as ET
import pandas as pd
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
import numpy as np
import umap
import plotly.express as px
import random
from hdbscan import HDBSCAN
from datetime import datetime
from collections import defaultdict

# Seed the stdlib and NumPy global RNGs with one shared value so repeated runs
# draw the same pseudo-random numbers from those two generators.
SEED = 2
random.seed(SEED)
np.random.seed(SEED)

# Ensure the NLTK stopword corpus is present locally, then load the English
# list. The same list is reused below for cleaning the abstracts and as the
# CountVectorizer stop-word list.
nltk.download('stopwords')
stopwords_nltk = stopwords.words('english')

# Step 1: Fetch Data from arXiv API
def _parse_arxiv_feed(xml_text):
    """Convert an arXiv Atom feed into a list of per-paper row dicts.

    Raises:
        ValueError: if an entry lacks its title, summary, published or id
            element (instead of failing with an AttributeError on None).
    """
    atom = "{http://www.w3.org/2005/Atom}"

    def required_text(entry, tag):
        node = entry.find(atom + tag)
        if node is None:
            raise ValueError(f"arXiv feed entry is missing its <{tag}> element")
        return node.text

    records = []
    for entry in ET.fromstring(xml_text).findall(atom + "entry"):
        # Authors without a usable <name> are dropped so the join cannot fail.
        author_names = [
            name
            for name in (
                author.findtext(atom + "name")
                for author in entry.findall(atom + "author")
            )
            if name
        ]
        records.append({
            "title": required_text(entry, "title"),
            "authors": ", ".join(author_names),
            "abstract": required_text(entry, "summary"),
            # Kept as the raw timestamp string from the feed.
            "published_date": required_text(entry, "published"),
            "arxiv_url": required_text(entry, "id"),
        })
    return records


def fetch_arxiv_abstracts(query="collective intelligence, social brain, group intelligence, crowd intelligence, social intelligence, collective rationality, complex adaptive systems, superorganism", max_results=50000):
    """Query the arXiv export API and return the matching papers as a DataFrame.

    The query string is wrapped in double quotes and sent as ``all:"<query>"``,
    i.e. the whole string (commas included) is submitted as one quoted term.
    NOTE(review): this presumably makes the comma-separated default behave as
    a single phrase rather than as alternative search terms - confirm against
    the arXiv API query syntax.

    Args:
        query: Text placed inside the quoted ``all:`` search term.
        max_results: Value sent as the ``max_results`` request parameter. Only
            one request is made (``start`` is always 0).
            NOTE(review): the API may cap or page large values - confirm.

    Returns:
        A DataFrame with the columns ``title``, ``authors`` (comma-joined),
        ``abstract``, ``published_date`` (timestamp string as delivered by the
        feed) and ``arxiv_url``. The columns are present even when the feed
        contains no entries.

    Raises:
        requests.HTTPError: if the API answers with an error status code.
        ValueError: if a feed entry lacks one of the required elements.
    """
    endpoint = 'http://export.arxiv.org/api/query'
    params = {
        'search_query': f'all:"{query}"',
        'start': 0,
        'max_results': max_results,
        'sortBy': 'relevance',
        # Direction applies to the sortBy key (relevance), not to the date.
        'sortOrder': 'descending',
    }
    response = requests.get(endpoint, params=params)
    # Fail here with the HTTP status rather than later inside the XML parser.
    response.raise_for_status()

    records = _parse_arxiv_feed(response.text)
    return pd.DataFrame(
        records,
        columns=["title", "authors", "abstract", "published_date", "arxiv_url"],
    )

# Fetch up to 10,000 records for the single search string
# "collective intelligence" (this overrides the function's longer default query)
df = fetch_arxiv_abstracts(query="collective intelligence",max_results=10000)
print("Data fetched from arXiv.")

# Keep a CSV copy of the raw fetch results (written without the index column)
df.to_csv("arxiv_collective_intelligence.csv", index=False)

# Step 2: Clean the Abstracts by Removing Stopwords
def remove_stopwords(text, stopwords_list):
    """Return *text* with every stopword token removed.

    The text is split on whitespace and each token is compared in lower case
    against the stopwords; surviving tokens keep their original casing and are
    re-joined with single spaces. Punctuation attached to a token is not
    stripped, so a token such as ``"the,"`` is kept.

    Args:
        text: The string to clean.
        stopwords_list: Collection of lower-case stopword strings.

    Returns:
        The cleaned string (empty if every token was a stopword).
    """
    # Membership tests against a list are linear per word; hash the stopwords
    # once per call so each lookup is constant time.
    if isinstance(stopwords_list, (set, frozenset)):
        stopword_set = stopwords_list
    else:
        stopword_set = set(stopwords_list)
    return " ".join(
        word for word in text.split() if word.lower() not in stopword_set
    )

# Strip the NLTK stopwords from every abstract; the cleaned text is what gets
# embedded and topic-modelled below.
df['cleaned_abstract'] = df['abstract'].apply(remove_stopwords, args=(stopwords_nltk,))

# Step 3: Use BERT for Embedding
# Pre-trained SentenceTransformer checkpoint used to embed the abstracts
model = SentenceTransformer('all-MiniLM-L6-v2')

# One embedding per cleaned abstract. The publication timestamps are pulled
# out in the same row order for the topics-over-time analysis.
abstracts = df['cleaned_abstract'].tolist()
embeddings = model.encode(abstracts, show_progress_bar=True)
timestamps = df['published_date'].tolist()
print("BERT embeddings generated.")

# Step 4: Topic Modeling with BERTopic
# Bag-of-words vectorizer for the topic representations, using the same NLTK
# stopword list that was used to clean the abstracts
vectorizer_model = CountVectorizer(stop_words=stopwords_nltk)

# Initialize the clustering model (HDBSCAN).
# NOTE(review): no random_state is passed here (the attempt is left commented
# out below) - confirm whether hdbscan.HDBSCAN accepts one before re-enabling.
hdbscan_model = HDBSCAN(
    min_cluster_size=4, # Smallest group of documents that may form its own cluster
    min_samples=5, # NOTE(review): presumably the core-point neighbourhood size; a higher value should label more points as outliers - confirm against the hdbscan docs
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True
    #random_state=222  # left disabled; see NOTE(review) above
)

# Initialize BERTopic with the custom clustering model and vectorizer
topic_model = BERTopic(
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    min_topic_size=5,  # NOTE(review): may be ignored because hdbscan_model is supplied - confirm in the BERTopic docs
    verbose=True,
    nr_topics=500,  # Topic-reduction target; the captured run log reports "from 14 to 14", i.e. no reduction in that run
    top_n_words=5  # Each topic is described by its 5 top words
)

# Fit on the cleaned abstracts, reusing the precomputed sentence embeddings,
# then compute the per-timestamp topic representation used for the temporal plot
topics, probabilities = topic_model.fit_transform(abstracts, embeddings)
topics_over_time = topic_model.topics_over_time(abstracts, timestamps)

# Display the assigned topics to check for issues
# (-1 entries are documents the model left unassigned to any topic)
print("Assigned topics:", topics)
print("Unique topics:", set(topics))

# Step 5: Visualize Topics
# Every BERTopic figure is written to a standalone HTML file and then handed
# to the Colab download helper.

# 1. Topic overview figure
topic_html = "topic_visualization.html"
topic_plot = topic_model.visualize_topics()
topic_plot.write_html(topic_html)
from google.colab import files
files.download(topic_html)

# 2. Topic hierarchy figure
hierarchy_html = "hierarchy_visualization.html"
hierarchy_plot = topic_model.visualize_hierarchy()
hierarchy_plot.write_html(hierarchy_html)
files.download(hierarchy_html)

# 3. Topic heatmap figure
heatmap_html = "heatmap_visualization.html"
heatmap_plot = topic_model.visualize_heatmap()
heatmap_plot.write_html(heatmap_html)
files.download(heatmap_html)

# 4. Per-topic bar chart figure
barchart_html = "barchart_visualization.html"
barchart_plot = topic_model.visualize_barchart()
barchart_plot.write_html(barchart_html)
files.download(barchart_html)

# Step 6: 2D scatter plot of the documents, coloured by topic

# 6.1 Reduce the sentence embeddings to 2D using UMAP
umap_model = umap.UMAP(n_neighbors=30, min_dist=0.1, metric='cosine', n_components=2)
umap_embeddings = umap_model.fit_transform(embeddings)

# 6.2 One row per document: UMAP coordinates plus the assigned topic id
topics_df = pd.DataFrame({
    'UMAP_1': umap_embeddings[:, 0],
    'UMAP_2': umap_embeddings[:, 1],
    'Topic': topics
})

# 6.3 Name every non-negative topic after its top words, indexed by topic id
topic_names = [
    " ".join(word for word, _ in topic_model.get_topic(topic_id))
    for topic_id in range(max(topics) + 1)
]

# Look names up by topic id through a dict. Indexing the list with the outlier
# id -1 would silently return the LAST topic's name and mislabel every
# unassigned document, so -1 gets its own entry.
topic_name_by_id = dict(enumerate(topic_names))
if -1 in topics:
    outlier_words = topic_model.get_topic(-1) or []
    topic_name_by_id[-1] = " ".join(word for word, _ in outlier_words) or "Outliers"

topics_df['Topic Name'] = topics_df['Topic'].map(topic_name_by_id)

# 6.4 Create a Plotly 2D scatter plot
fig = px.scatter(topics_df, x='UMAP_1', y='UMAP_2', color='Topic Name',
                 hover_data=['Topic', 'Topic Name'], title="2D Topic Scatter Plot (Colorized by Topics)")

# 6.5 Save the plot as an interactive HTML file
fig.write_html("topic_scatter_2d_plot_with_lines.html")

# Download the file in Colab
files.download("topic_scatter_2d_plot_with_lines.html")

# Topics-over-time figure, built from the frame computed right after fitting
# and exported as a standalone HTML file
temporal_html = "temporal_visualization.html"
temporal_plot = topic_model.visualize_topics_over_time(topics_over_time)
temporal_plot.write_html(temporal_html)

# Hand the file to the Colab download helper
files.download(temporal_html)


# Step 7: Add Topic Assignments and Topic Names to DataFrame
df['Topic'] = topics  # Add topic IDs to the DataFrame

# Build the label for each assigned topic id straight from the model. A
# positional list lookup would hand the outlier id -1 the LAST topic's name,
# so labels are keyed by topic id and -1 gets its own entry.
topic_labels = {
    topic_id: " ".join(word for word, _ in topic_model.get_topic(topic_id))
    for topic_id in set(topics)
    if topic_id != -1
}
if -1 in topics:
    outlier_words = topic_model.get_topic(-1) or []
    topic_labels[-1] = " ".join(word for word, _ in outlier_words) or "Outliers"

df['Topic Name'] = df['Topic'].map(topic_labels)  # Add topic names based on the top words

# Select relevant columns for the HTML table
table_data = df[['title', 'abstract', 'Topic', 'Topic Name']]

# Step 8: Save the DataFrame as an HTML file
html_file = "papers_with_topics.html"
table_data.to_html(html_file, index=False)

# Optionally, download the HTML file in Google Colab
from google.colab import files
files.download(html_file)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Data fetched from arXiv.


Batches:   0%|          | 0/10 [00:00<?, ?it/s]

2024-11-08 17:07:11,735 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


BERT embeddings generated.


2024-11-08 17:07:12,411 - BERTopic - Dimensionality - Completed ✓
2024-11-08 17:07:12,414 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-11-08 17:07:12,432 - BERTopic - Cluster - Completed ✓
2024-11-08 17:07:12,434 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-11-08 17:07:12,521 - BERTopic - Representation - Completed ✓
2024-11-08 17:07:12,522 - BERTopic - Topic reduction - Reducing number of topics
2024-11-08 17:07:12,527 - BERTopic - Topic reduction - Reduced number of topics from 14 to 14
295it [00:05, 55.96it/s]


Assigned topics: [11, 5, -1, 5, 5, 6, 5, -1, 5, 4, 12, -1, 6, 5, 6, 6, -1, 1, 11, 5, 11, 5, 6, 5, 5, 5, 12, 5, 1, 5, 12, 5, -1, 1, 9, 5, 5, 4, 6, -1, 10, -1, -1, 8, 10, 8, 3, 5, 5, 0, 5, 5, -1, -1, 4, 6, 1, 0, 5, -1, 0, 5, 1, 4, 1, 5, -1, 2, 5, -1, 6, 10, 2, 5, -1, -1, -1, 6, 5, 5, 6, 6, -1, 6, 1, 5, 3, -1, 6, 12, 6, 6, 2, 0, 5, -1, 4, 5, 5, 3, -1, 7, -1, -1, 6, -1, -1, 6, -1, 7, -1, 6, 6, 7, 5, 8, 1, -1, 1, 2, 1, 1, 1, 7, -1, 1, 7, 0, 12, -1, 12, 3, 6, 6, 12, 0, 6, 3, -1, -1, 11, 1, 1, 6, 0, 1, 1, 0, 6, 3, -1, -1, 0, -1, 6, 1, 2, 6, -1, 1, -1, 1, 6, 7, 5, 10, 5, 6, 1, 1, 4, 0, 1, -1, 5, -1, -1, 0, 1, 9, 6, 6, 10, 1, -1, 1, 10, -1, 5, -1, -1, 2, 6, 1, 6, 7, -1, -1, 8, -1, 1, 9, 5, -1, 8, -1, -1, 1, 1, 1, 1, 1, 5, 12, 3, 6, 6, -1, 6, 1, 11, 3, 5, 10, -1, 0, 6, 5, 6, 1, -1, 6, 7, -1, 2, 5, 9, 6, 7, 9, 5, -1, 7, 10, 6, 6, 3, 6, 1, 5, 1, 10, 1, 7, 4, -1, 10, 8, 6, 1, 2, 10, 5, 5, 2, 10, 9, -1, 4, 9, 3, -1, -1, 10, -1, 9, 5, 4, 8, 8, -1, 4, 3, 1, 4, 9, -1, 5, 1, 10, -1, -1, 9, 5, 7]
Unique 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>